Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 5
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/async.c | 66
-rw-r--r--  kernel/bpf/Makefile | 3
-rw-r--r--  kernel/bpf/arraymap.c | 42
-rw-r--r--  kernel/bpf/bpf_inode_storage.c | 2
-rw-r--r--  kernel/bpf/bpf_iter.c | 16
-rw-r--r--  kernel/bpf/bpf_local_storage.c | 39
-rw-r--r--  kernel/bpf/bpf_lsm.c | 8
-rw-r--r--  kernel/bpf/bpf_task_storage.c | 100
-rw-r--r--  kernel/bpf/btf.c | 325
-rw-r--r--  kernel/bpf/core.c | 54
-rw-r--r--  kernel/bpf/cpumap.c | 27
-rw-r--r--  kernel/bpf/devmap.c | 47
-rw-r--r--  kernel/bpf/disasm.c | 13
-rw-r--r--  kernel/bpf/hashtab.c | 67
-rw-r--r--  kernel/bpf/helpers.c | 335
-rw-r--r--  kernel/bpf/inode.c | 2
-rw-r--r--  kernel/bpf/local_storage.c | 5
-rw-r--r--  kernel/bpf/lpm_trie.c | 3
-rw-r--r--  kernel/bpf/syscall.c | 31
-rw-r--r--  kernel/bpf/trampoline.c | 4
-rw-r--r--  kernel/bpf/verifier.c | 853
-rw-r--r--  kernel/cgroup/cgroup.c | 34
-rw-r--r--  kernel/cgroup/rstat.c | 63
-rw-r--r--  kernel/configs/android-base.config | 1
-rw-r--r--  kernel/configs/tiny-base.config | 1
-rw-r--r--  kernel/cpu.c | 210
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/dma/direct.c | 8
-rw-r--r--  kernel/dma/direct.h | 6
-rw-r--r--  kernel/dma/map_benchmark.c | 23
-rw-r--r--  kernel/dma/mapping.c | 148
-rw-r--r--  kernel/dma/remap.c | 1
-rw-r--r--  kernel/dma/swiotlb.c | 520
-rw-r--r--  kernel/events/core.c | 5
-rw-r--r--  kernel/exit.c | 68
-rw-r--r--  kernel/fork.c | 45
-rw-r--r--  kernel/futex.c | 82
-rw-r--r--  kernel/gcov/Kconfig | 1
-rw-r--r--  kernel/gcov/base.c | 49
-rw-r--r--  kernel/gcov/clang.c | 276
-rw-r--r--  kernel/gcov/fs.c | 110
-rw-r--r--  kernel/gcov/gcc_4_7.c | 173
-rw-r--r--  kernel/gcov/gcov.h | 14
-rwxr-xr-x  kernel/gen_kheaders.sh | 2
-rw-r--r--  kernel/irq/generic-chip.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 20
-rw-r--r--  kernel/irq_work.c | 7
-rw-r--r--  kernel/kcsan/debugfs.c | 3
-rw-r--r--  kernel/kexec_core.c | 4
-rw-r--r--  kernel/kexec_file.c | 4
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/kthread.c | 33
-rw-r--r--  kernel/locking/qrwlock.c | 6
-rw-r--r--  kernel/module.c | 18
-rw-r--r--  kernel/ptrace.c | 43
-rw-r--r--  kernel/resource.c | 194
-rw-r--r--  kernel/rseq.c | 29
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/core.c | 219
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 2
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 2
-rw-r--r--  kernel/sched/deadline.c | 12
-rw-r--r--  kernel/sched/debug.c | 435
-rw-r--r--  kernel/sched/fair.c | 394
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle.c | 10
-rw-r--r--  kernel/sched/loadavg.c | 2
-rw-r--r--  kernel/sched/pelt.c | 2
-rw-r--r--  kernel/sched/pelt.h | 2
-rw-r--r--  kernel/sched/psi.c | 200
-rw-r--r--  kernel/sched/rt.c | 6
-rw-r--r--  kernel/sched/sched.h | 59
-rw-r--r--  kernel/sched/stats.c | 2
-rw-r--r--  kernel/sched/stats.h | 37
-rw-r--r--  kernel/sched/topology.c | 113
-rw-r--r--  kernel/seccomp.c | 17
-rw-r--r--  kernel/signal.c | 59
-rw-r--r--  kernel/smp.c | 238
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 14
-rw-r--r--  kernel/sys_ni.c | 6
-rw-r--r--  kernel/sysctl.c | 173
-rw-r--r--  kernel/task_work.c | 38
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 1
-rw-r--r--  kernel/trace/bpf_trace.c | 371
-rw-r--r--  kernel/trace/fgraph.c | 4
-rw-r--r--  kernel/trace/ftrace.c | 58
-rw-r--r--  kernel/trace/ring_buffer.c | 142
-rw-r--r--  kernel/trace/synth_event_gen_test.c | 2
-rw-r--r--  kernel/trace/trace.c | 361
-rw-r--r--  kernel/trace/trace.h | 35
-rw-r--r--  kernel/trace/trace_clock.c | 44
-rw-r--r--  kernel/trace/trace_entries.h | 22
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 214
-rw-r--r--  kernel/trace/trace_events_filter.c | 18
-rw-r--r--  kernel/trace/trace_events_hist.c | 100
-rw-r--r--  kernel/trace/trace_events_synth.c | 2
-rw-r--r--  kernel/trace/trace_events_trigger.c | 45
-rw-r--r--  kernel/trace/trace_functions.c | 223
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_hwlat.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 93
-rw-r--r--  kernel/trace/trace_printk.c | 11
-rw-r--r--  kernel/trace/trace_probe.c | 6
-rw-r--r--  kernel/trace/trace_probe.h | 2
-rw-r--r--  kernel/trace/trace_probe_tmpl.h | 2
-rw-r--r--  kernel/trace/trace_selftest.c | 4
-rw-r--r--  kernel/trace/trace_seq.c | 12
-rw-r--r--  kernel/ucount.c | 4
-rw-r--r--  kernel/umh.c | 8
-rw-r--r--  kernel/up.c | 42
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/watchdog.c | 88
121 files changed, 5129 insertions, 3125 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 78701ea37c97..c6b299a6b786 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,4 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-kheaders.md5
-timeconst.h
-hz.bc
+/config_data
+/kheaders.md5
diff --git a/kernel/Makefile b/kernel/Makefile
index e8a6715f38dc..4df609be42d0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -142,10 +142,15 @@ obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
$(obj)/configs.o: $(obj)/config_data.gz
-targets += config_data.gz
-$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
+targets += config_data config_data.gz
+$(obj)/config_data.gz: $(obj)/config_data FORCE
$(call if_changed,gzip)
+filechk_cat = cat $<
+
+$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
+ $(call filechk,cat)
+
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
diff --git a/kernel/async.c b/kernel/async.c
index 33258e6e20f8..b8d7a663497f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
+static long long microseconds_since(ktime_t start)
+{
+ ktime_t now = ktime_get();
+ return ktime_to_ns(ktime_sub(now, start)) >> 10;
+}
+
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
@@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t calltime, delta, rettime;
+ ktime_t calltime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("calling %lli_%pS @ %i\n",
- (long long)entry->cookie,
- entry->func, task_pid_nr(current));
- calltime = ktime_get();
- }
+ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- rettime = ktime_get();
- delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
- (long long)entry->cookie,
- entry->func,
- (long long)ktime_to_ns(delta) >> 10);
- }
+
+ pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+ (long long)entry->cookie, entry->func,
+ microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
@@ -246,24 +246,6 @@ void async_synchronize_full(void)
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
- *
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
- */
-void async_unregister_domain(struct async_domain *domain)
-{
- spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->pending));
- domain->registered = 0;
- spin_unlock_irq(&async_lock);
-}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
-
-/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
* @domain: the domain to synchronize
*
@@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t starttime, delta, endtime;
+ ktime_t starttime;
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("async_waiting @ %i\n", task_pid_nr(current));
- starttime = ktime_get();
- }
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- endtime = ktime_get();
- delta = ktime_sub(endtime, starttime);
-
- pr_debug("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current),
- (long long)ktime_to_ns(delta) >> 10);
- }
+ pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+ microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
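
[Aside, not part of the patch: microseconds_since() reuses the long-standing initcall_debug shortcut of shifting nanoseconds right by 10 (divide by 1024) instead of dividing by 1000, which is a few percent low but avoids a 64-bit division. A minimal userspace C sketch of that conversion, with invented names:]

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: same trick as microseconds_since(), ns >> 10 ~= usecs */
static long long approx_usecs(int64_t delta_ns)
{
    return delta_ns >> 10;
}

int main(void)
{
    int64_t delta_ns = 1500000;     /* 1.5 ms */

    printf("exact: %lld usec, shifted: %lld usec\n",
           (long long)(delta_ns / 1000), approx_usecs(delta_ns));
    return 0;
}
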
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index d1249340fd6b..7f33098ca63f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
-obj-${CONFIG_BPF_LSM} += bpf_task_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1f8453343bf2..3c4105603f9d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
+static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ u32 i, key, num_elems = 0;
+ struct bpf_array *array;
+ bool is_percpu;
+ u64 ret = 0;
+ void *val;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ array = container_of(map, struct bpf_array, map);
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < map->max_entries; i++) {
+ if (is_percpu)
+ val = this_cpu_ptr(array->pptrs[i]);
+ else
+ val = array->value + array->elem_size * i;
+ num_elems++;
+ key = i;
+ ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+ (u64)(long)&key, (u64)(long)val,
+ (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ break;
+ }
+
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
@@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = {
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &array_map_btf_id,
.iter_seq_info = &iter_seq_info,
@@ -660,6 +698,10 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &percpu_array_map_btf_id,
.iter_seq_info = &iter_seq_info,
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b58b2efb9b43..2921ca39a93e 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, NULL);
}
static int inode_storage_map_btf_id;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index a0d9eade9c80..931870f9cf56 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
*/
return ret == 0 ? 0 : -EAGAIN;
}
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+ void *, callback_ctx, u64, flags)
+{
+ return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+ .func = bpf_for_each_map_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
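
[Aside, not part of the patch: a sketch of how a BPF program might use the new bpf_for_each_map_elem() helper, loosely modeled on the kernel selftests for this series. The callback receives the map, pointers to the key and value, and the caller-supplied context, and returns 0 to continue or 1 to stop. Map and function names here are invented; this assumes a libbpf/clang BPF build environment.]

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map;

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 4);
    __type(key, __u32);
    __type(value, __u64);
} arraymap SEC(".maps");

struct callback_ctx {
    __u64 output;
};

/* return 0 to continue iterating, 1 to stop early */
static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                      struct callback_ctx *data)
{
    data->output += *val;
    return 0;
}

SEC("tc")
int sum_array(struct __sk_buff *skb)
{
    struct callback_ctx data = { .output = 0 };

    bpf_for_each_map_elem(&arraymap, sum_elem, &data, 0);
    return data.output ? 1 : 0;
}

char _license[] SEC("license") = "GPL";
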
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index dd5aedee99e7..b305270b7a4b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
+ unsigned long flags;
if (unlikely(!selem_linked_to_storage(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference(selem->local_storage);
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true);
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
kfree_rcu(local_storage, rcu);
@@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
if (unlikely(!selem_linked_to_map(selem)))
/* selem has already be unlinked from smap */
@@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
smap = rcu_dereference(SDATA(selem)->smap);
b = select_bucket(smap, selem);
- raw_spin_lock_bh(&b->lock);
+ raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_bh(&b->lock);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+ unsigned long flags;
- raw_spin_lock_bh(&b->lock);
+ raw_spin_lock_irqsave(&b->lock, flags);
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_bh(&b->lock);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
@@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
sdata = SDATA(selem);
if (cacheit_lockit) {
+ unsigned long flags;
+
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next bpf_local_storage_lookup().
*/
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
if (selem_linked_to_storage(selem))
rcu_assign_pointer(local_storage->cache[smap->cache_idx],
sdata);
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
return sdata;
@@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
+ unsigned long flags;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
- raw_spin_lock_bh(&local_storage->lock);
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
if (unlikely(hlist_empty(&local_storage->list))) {
@@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
unlock:
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return SDATA(selem);
unlock_err:
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return ERR_PTR(err);
}
@@ -468,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+ int __percpu *busy_counter)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map_bucket *b;
@@ -497,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ __this_cpu_inc(*busy_counter);
+ }
bpf_selem_unlink(selem);
+ if (busy_counter) {
+ __this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
cond_resched_rcu();
}
rcu_read_unlock();
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 0ff58259ccf8..5efb2b24012c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -67,7 +67,7 @@ BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
-const static struct bpf_func_proto bpf_bprm_opts_set_proto = {
+static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
.func = bpf_bprm_opts_set,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -88,7 +88,7 @@ static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
-const static struct bpf_func_proto bpf_ima_inode_hash_proto = {
+static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
- case BPF_FUNC_task_storage_get:
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- return &bpf_task_storage_delete_proto;
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e0da0258b732..3ce75758d394 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -15,21 +15,41 @@
#include <linux/bpf_local_storage.h>
#include <linux/filter.h>
#include <uapi/linux/btf.h>
-#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
+static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+ migrate_disable();
+ __this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
struct task_struct *task = owner;
- struct bpf_storage_blob *bsb;
- bsb = bpf_task(task);
- if (!bsb)
- return NULL;
- return &bsb->storage;
+ return &task->bpf_storage;
}
static struct bpf_local_storage_data *
@@ -38,13 +58,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
{
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- struct bpf_storage_blob *bsb;
-
- bsb = bpf_task(task);
- if (!bsb)
- return NULL;
- task_storage = rcu_dereference(bsb->storage);
+ task_storage = rcu_dereference(task->bpf_storage);
if (!task_storage)
return NULL;
@@ -57,16 +72,12 @@ void bpf_task_storage_free(struct task_struct *task)
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
- struct bpf_storage_blob *bsb;
struct hlist_node *n;
-
- bsb = bpf_task(task);
- if (!bsb)
- return;
+ unsigned long flags;
rcu_read_lock();
- local_storage = rcu_dereference(bsb->storage);
+ local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage) {
rcu_read_unlock();
return;
@@ -81,7 +92,8 @@ void bpf_task_storage_free(struct task_struct *task)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
- raw_spin_lock_bh(&local_storage->lock);
+ bpf_task_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
@@ -90,7 +102,8 @@ void bpf_task_storage_free(struct task_struct *task)
free_task_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, false);
}
- raw_spin_unlock_bh(&local_storage->lock);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_task_storage_unlock();
rcu_read_unlock();
/* free_task_storage should always be true as long as
@@ -123,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
goto out;
}
+ bpf_task_storage_lock();
sdata = task_storage_lookup(task, map, true);
+ bpf_task_storage_unlock();
put_pid(pid);
return sdata ? sdata->data : NULL;
out:
@@ -150,13 +165,15 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
*/
WARN_ON_ONCE(!rcu_read_lock_held());
task = pid_task(pid, PIDTYPE_PID);
- if (!task || !task_storage_ptr(task)) {
+ if (!task) {
err = -ENOENT;
goto out;
}
+ bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags);
+ bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -199,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
goto out;
}
+ bpf_task_storage_lock();
err = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
@@ -213,44 +232,47 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
- /* explicitly check that the task_storage_ptr is not
- * NULL as task_storage_lookup returns NULL in this case and
- * bpf_local_storage_update expects the owner to have a
- * valid storage pointer.
- */
- if (!task || !task_storage_ptr(task))
+ if (!task)
+ return (unsigned long)NULL;
+
+ if (!bpf_task_storage_trylock())
return (unsigned long)NULL;
sdata = task_storage_lookup(task, map, true);
if (sdata)
- return (unsigned long)sdata->data;
+ goto unlock;
- /* This helper must only be called from places where the lifetime of the task
- * is guaranteed. Either by being refcounted or by being protected
- * by an RCU read-side critical section.
- */
- if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ /* only allocate new storage, when the task is refcounted */
+ if (refcount_read(&task->usage) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST);
- return IS_ERR(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
- }
- return (unsigned long)NULL;
+unlock:
+ bpf_task_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
}
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
task)
{
+ int ret;
+
if (!task)
return -EINVAL;
+ if (!bpf_task_storage_trylock())
+ return -EBUSY;
+
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- return task_storage_delete(task, map);
+ ret = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
+ return ret;
}
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -276,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap);
+ bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
}
static int task_storage_map_btf_id;
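
[Aside, not part of the patch: the bpf_task_storage_trylock()/busy-counter scheme above makes re-entry (e.g. a tracing program firing while the storage code already holds the lock on this CPU) fail fast with -EBUSY instead of deadlocking. A rough userspace analogue, with a thread-local counter standing in for the per-CPU one; names are invented:]

#include <stdbool.h>
#include <stdio.h>

static _Thread_local int storage_busy;

static bool storage_trylock(void)
{
    if (++storage_busy != 1) {      /* already inside: refuse re-entry */
        --storage_busy;
        return false;
    }
    return true;
}

static void storage_unlock(void)
{
    --storage_busy;
}

int main(void)
{
    if (storage_trylock()) {
        /* nested attempt is rejected instead of blocking */
        printf("outer: locked, nested trylock -> %d\n", storage_trylock());
        storage_unlock();
    }
    return 0;
}
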
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b1a76fe046cb..0600ed325fa0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -173,7 +173,7 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x8f00ffff
+#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -280,9 +280,10 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
+ [BTF_KIND_FLOAT] = "FLOAT",
};
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
@@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
+ case BTF_KIND_FLOAT:
return true;
}
@@ -787,7 +789,6 @@ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
while (btf_type_is_modifier(t) &&
BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
- id = t->type;
t = btf_type_by_id(btf, t->type);
}
@@ -1704,6 +1705,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_FLOAT:
size = type->size;
goto resolved;
@@ -1849,7 +1851,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env,
return -EINVAL;
}
-/* Used for ptr, array and struct/union type members.
+/* Used for ptr, array struct/union and float type members.
* int, enum and modifier types have their specific callback functions.
*/
static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
@@ -3675,6 +3677,81 @@ static const struct btf_kind_operations datasec_ops = {
.show = btf_datasec_show,
};
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+ t->size != 16) {
+ btf_verifier_log_type(env, t, "Invalid type_size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u64 start_offset_bytes;
+ u64 end_offset_bytes;
+ u64 misalign_bits;
+ u64 align_bytes;
+ u64 align_bits;
+
+ /* Different architectures have different alignment requirements, so
+ * here we check only for the reasonable minimum. This way we ensure
+ * that types after CO-RE can pass the kernel BTF verifier.
+ */
+ align_bytes = min_t(u64, sizeof(void *), member_type->size);
+ align_bits = align_bytes * BITS_PER_BYTE;
+ div64_u64_rem(member->offset, align_bits, &misalign_bits);
+ if (misalign_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not properly aligned");
+ return -EINVAL;
+ }
+
+ start_offset_bytes = member->offset / BITS_PER_BYTE;
+ end_offset_bytes = start_offset_bytes + member_type->size;
+ if (end_offset_bytes > struct_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void btf_float_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+ .check_meta = btf_float_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_float_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_float_log,
+ .show = btf_df_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -3808,6 +3885,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
+ [BTF_KIND_FLOAT] = &float_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4298,7 +4376,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_LINK_TYPE
static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
@@ -4592,8 +4670,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
arg = off / 8;
args = (const struct btf_param *)(t + 1);
- /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
- nr_args = t ? btf_type_vlen(t) : 5;
+ /* if (t == NULL) Fall back to default BPF prog with
+ * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+ */
+ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
if (prog->aux->attach_btf_trace) {
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
@@ -4649,7 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
} else {
if (!t)
- /* Default prog with 5 args */
+ /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
return true;
t = btf_type_by_id(btf, args[arg].type);
}
@@ -5100,12 +5180,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
if (!func) {
/* BTF function prototype doesn't match the verifier types.
- * Fall back to 5 u64 args.
+ * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < 5; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
m->arg_size[i] = 8;
m->ret_size = 8;
- m->nr_args = 5;
+ m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
@@ -5281,121 +5361,190 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs,
+ bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- const struct btf_param *args;
+ const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
- u32 i, nargs, btf_id, type_size;
- const char *tname;
- bool is_global;
-
- if (!prog->aux->func_info)
- return -EINVAL;
-
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
-
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+ const struct btf_param *args;
+ u32 i, nargs, ref_id;
- t = btf_type_by_id(btf, btf_id);
+ t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
- * struct bpf_func_info
+ * struct bpf_func_info or in add_kfunc_call().
*/
- bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
- subprog);
+ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+ func_id);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
+ func_name = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", tname);
+ bpf_log(log, "Invalid BTF of func %s\n", func_name);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
- goto out;
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
}
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *reg = &regs[i + 1];
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
- t = btf_type_by_id(btf, args[i].type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ if (btf_type_is_scalar(t)) {
if (reg->type == SCALAR_VALUE)
continue;
- bpf_log(log, "R%d is not a scalar\n", i + 1);
- goto out;
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
}
- if (btf_type_is_ptr(t)) {
+
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ if (btf_is_kernel(btf)) {
+ const struct btf_type *reg_ref_t;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (!btf_type_is_struct(ref_t)) {
+ bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ func_name, i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (reg->type == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else if (reg2btf_ids[reg->type]) {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[reg->type];
+ } else {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname, regno);
+ return -EINVAL;
+ }
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+ &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf,
+ reg_ref_t->name_off);
+ if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+ reg->off, btf, ref_id)) {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname,
+ regno, btf_type_str(reg_ref_t),
+ reg_ref_tname);
+ return -EINVAL;
+ }
+ } else if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
- if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
- }
- if (check_ctx_reg(env, reg, i + 1))
- goto out;
- continue;
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
}
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (ptr_to_mem_ok) {
+ const struct btf_type *resolve_ret;
+ u32 type_size;
- if (!is_global)
- goto out;
-
- t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- ref_t = btf_resolve_size(btf, t, &type_size);
- if (IS_ERR(ref_t)) {
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
- PTR_ERR(ref_t));
- goto out;
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname,
+ PTR_ERR(resolve_ret));
+ return -EINVAL;
}
- if (check_mem_reg(env, reg, i + 1, type_size))
- goto out;
-
- continue;
+ if (check_mem_reg(env, reg, regno, type_size))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
}
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
}
+
return 0;
-out:
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
* In such cases mark the function as unreliable from BTF point of view.
*/
- prog->aux->func_info_aux[subprog].unreliable = true;
- return -EINVAL;
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs)
+{
+ return btf_check_func_arg_match(env, btf, func_id, regs, false);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5458,9 +5607,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
- tname, nargs);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
/* check that function returns int */
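
[Aside, not part of the patch: btf_float_check_member() only enforces a conservative minimum alignment of min(sizeof(void *), member size), so BTF emitted after CO-RE relocation still passes the in-kernel verifier regardless of architecture-specific ABI rules. A small userspace sketch of that rule; offsets are in bits, as in struct btf_member, and the names are invented:]

#include <stdio.h>
#include <stdint.h>

static int float_member_ok(uint32_t offset_bits, uint32_t size_bytes)
{
    uint64_t align_bytes = size_bytes < sizeof(void *) ? size_bytes
                                                       : sizeof(void *);
    uint64_t align_bits = align_bytes * 8;

    return (offset_bits % align_bits) == 0;
}

int main(void)
{
    /* on a 64-bit host: 8-byte double at byte offset 8 is fine,
     * at byte offset 4 it is rejected
     */
    printf("%d %d\n", float_member_ok(64, 8), float_member_ok(32, 8));
    return 0;
}
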
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75244ecb2389..5e31ee9f7512 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -143,25 +143,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
if (!prog->aux->nr_linfo || !prog->jit_requested)
return 0;
- prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
- sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!prog->aux->jited_linfo)
return -ENOMEM;
return 0;
}
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
- kfree(prog->aux->jited_linfo);
- prog->aux->jited_linfo = NULL;
-}
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
- if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
- bpf_prog_free_jited_linfo(prog);
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
}
/* The jit engine is responsible to provide an array
@@ -217,12 +217,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
- bpf_prog_free_jited_linfo(prog);
- kvfree(prog->aux->linfo);
-}
-
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -1369,11 +1363,10 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
* __bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
- * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
@@ -1707,7 +1700,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
@@ -1724,7 +1717,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
BPF_R3 = r3; \
BPF_R4 = r4; \
BPF_R5 = r5; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define EVAL1(FN, X) FN(X)
@@ -1849,9 +1842,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
+ bool jit_needed = false;
+
if (fp->bpf_func)
goto finalize;
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
@@ -1866,14 +1865,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
return fp;
fp = bpf_int_jit_compile(fp);
- if (!fp->jited) {
- bpf_prog_free_jited_linfo(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
return fp;
-#endif
- } else {
- bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
@@ -2354,6 +2349,11 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
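
[Aside, not part of the patch: the new bpf_jit_supports_kfunc_call() follows the kernel's usual weak-default pattern: core code provides a weak "not supported" stub that an architecture JIT can override with a strong definition. A standalone C illustration of the pattern, not kernel code:]

#include <stdbool.h>
#include <stdio.h>

/* weak default: used unless a strong definition is linked in */
__attribute__((weak)) bool jit_supports_kfunc_call(void)
{
    return false;
}

/* an arch JIT would provide the strong override, e.g.:
 * bool jit_supports_kfunc_call(void) { return true; }
 */

int main(void)
{
    printf("kfunc calls supported: %d\n", jit_supports_kfunc_call());
    return 0;
}
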
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5d1469de6921..5dd3e866599a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -27,7 +27,7 @@
#include <linux/capability.h>
#include <trace/events/xdp.h>
-#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
/* General idea: XDP packets getting XDP redirected to another CPU,
@@ -252,11 +252,12 @@ static int cpu_map_kthread_run(void *data)
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
+ unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- unsigned int drops = 0, sched = 0;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
int i, n, m, nframes;
+ LIST_HEAD(list);
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -297,7 +298,7 @@ static int cpu_map_kthread_run(void *data)
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
- drops += nframes;
+ kmem_alloc_drops += nframes;
}
}
@@ -305,7 +306,6 @@ static int cpu_map_kthread_run(void *data)
for (i = 0; i < nframes; i++) {
struct xdp_frame *xdpf = frames[i];
struct sk_buff *skb = skbs[i];
- int ret;
skb = __xdp_build_skb_from_frame(xdpf, skb,
xdpf->dev_rx);
@@ -314,13 +314,13 @@ static int cpu_map_kthread_run(void *data)
continue;
}
- /* Inject into network stack */
- ret = netif_receive_skb_core(skb);
- if (ret == NET_RX_DROP)
- drops++;
+ list_add_tail(&skb->list, &list);
}
+ netif_receive_skb_list(&list);
+
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
+ sched, &stats);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map)
* complete.
*/
- bpf_clear_redirect_map(map);
synchronize_rcu();
/* For cpu_map the remote CPUs can still be using the entries
@@ -563,7 +562,7 @@ static void cpu_map_free(struct bpf_map *map)
kfree(cmap);
}
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -600,6 +599,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+}
+
static int cpu_map_btf_id;
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -612,6 +616,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_cpu_map",
.map_btf_id = &cpu_map_btf_id,
+ .map_redirect = cpu_map_redirect,
};
static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 85d9d1b72a33..aa516472ce46 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -258,7 +257,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -330,7 +329,7 @@ bool dev_map_can_have_prog(struct bpf_map *map)
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
- int sent = 0, drops = 0, err = 0;
+ int sent = 0, err = 0;
int i;
if (unlikely(!bq->count))
@@ -344,29 +343,23 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
if (sent < 0) {
+ /* If ndo_xdp_xmit fails with an errno, no frames have
+ * been xmit'ed.
+ */
err = sent;
sent = 0;
- goto error;
}
- drops = bq->count - sent;
-out:
- bq->count = 0;
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
- bq->dev_rx = NULL;
- __list_del_clearprev(&bq->flush_node);
- return;
-error:
- /* If ndo_xdp_xmit fails with an errno, no frames have been
- * xmit'ed and it's our responsibility to them free all.
+ /* If not all frames have been transmitted, it is our
+ * responsibility to free them
*/
- for (i = 0; i < bq->count; i++) {
- struct xdp_frame *xdpf = bq->q[i];
+ for (i = sent; unlikely(i < bq->count); i++)
+ xdp_return_frame_rx_napi(bq->q[i]);
- xdp_return_frame_rx_napi(xdpf);
- drops++;
- }
- goto out;
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
+ bq->dev_rx = NULL;
+ bq->count = 0;
+ __list_del_clearprev(&bq->flush_node);
}
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -392,7 +385,7 @@ void __dev_flush(void)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *obj;
@@ -735,6 +728,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
+static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+}
+
+static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+}
+
static int dev_map_btf_id;
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -747,6 +750,7 @@ const struct bpf_map_ops dev_map_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_dtab",
.map_btf_id = &dev_map_btf_id,
+ .map_redirect = dev_map_redirect,
};
static int dev_map_hash_map_btf_id;
@@ -761,6 +765,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_dtab",
.map_btf_id = &dev_map_hash_map_btf_id,
+ .map_redirect = dev_hash_map_redirect,
};
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
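
[Aside, not part of the patch: the reworked bq_xmit_all() relies on a simple contract: ndo_xdp_xmit() either returns a negative errno (nothing was sent) or the number of frames it accepted, and every frame in [sent, count) must be handed back by the caller. A toy userspace sketch of that accounting, with invented names:]

#include <stdio.h>

/* driver_ret mimics ndo_xdp_xmit(): < 0 means nothing sent, else frames sent */
static void flush(int count, int driver_ret)
{
    int sent = driver_ret < 0 ? 0 : driver_ret;
    int i, freed = 0;

    for (i = sent; i < count; i++)
        freed++;        /* stands in for xdp_return_frame_rx_napi(bq->q[i]) */

    printf("sent=%d freed=%d err=%d\n", sent, freed,
           driver_ret < 0 ? driver_ret : 0);
}

int main(void)
{
    flush(8, 5);        /* partial send: 3 frames handed back */
    flush(8, -1);       /* errno-style failure: all 8 frames handed back */
    return 0;
}
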
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index faa54d58972c..bbfc6bb79240 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
- if (insn->src_reg != BPF_PSEUDO_CALL &&
+ if (!insn->src_reg &&
insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
func_id_str[insn->imm])
return func_id_str[insn->imm];
- if (cbs && cbs->cb_call)
- return cbs->cb_call(cbs->private_data, insn);
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
return buff;
}
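
[Aside, not part of the patch: the change above lets the disassembly callback decline to name a call by returning NULL, in which case __func_get_name() falls through to generic formatting. A trivial standalone C sketch of that resolver-with-fallback shape; all names are invented:]

#include <stdio.h>

static const char *resolve_cb(int imm)
{
    (void)imm;
    return NULL;                    /* pretend the callback cannot resolve it */
}

static const char *func_name(int imm, char *buf, size_t len)
{
    const char *res = resolve_cb(imm);

    if (res)
        return res;
    snprintf(buf, len, "%+d", imm); /* generic "relative call" formatting */
    return buf;
}

int main(void)
{
    char buf[32];

    printf("%s\n", func_name(7, buf, sizeof(buf)));
    return 0;
}
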
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d63912e73ad9..d7ebb12ffffc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -31,7 +31,7 @@
/*
* The bucket lock has two protection scopes:
*
- * 1) Serializing concurrent operations from BPF programs on differrent
+ * 1) Serializing concurrent operations from BPF programs on different
* CPUs
*
* 2) Serializing concurrent operations from BPF programs and sys_bpf()
@@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
+static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ u32 roundup_key_size;
+ int i, num_elems = 0;
+ void __percpu *pptr;
+ struct bucket *b;
+ void *key, *val;
+ bool is_percpu;
+ u64 ret = 0;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = htab_is_percpu(htab);
+
+ roundup_key_size = round_up(map->key_size, 8);
+ /* disable migration so percpu value prepared here will be the
+ * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ */
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ key = elem->key;
+ if (is_percpu) {
+ /* current cpu value for percpu map */
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ val = this_cpu_ptr(pptr);
+ } else {
+ val = elem->key + roundup_key_size;
+ }
+ num_elems++;
+ ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+ (u64)(long)key, (u64)(long)val,
+ (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret) {
+ rcu_read_unlock();
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_map_btf_id,
@@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_lru_map_btf_id,
@@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_percpu),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_percpu_map_btf_id,
@@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
BATCH_OPS(htab_lru_percpu),
.map_btf_name = "bpf_htab",
.map_btf_id = &htab_lru_percpu_map_btf_id,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 308427fe03a3..544773970dbc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
};
#ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
@@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
* verifier checks that its value is correct.
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
+ struct bpf_cgroup_storage *storage = NULL;
void *ptr;
+ int i;
- storage = this_cpu_read(bpf_cgroup_storage[stype]);
+ for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+ if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ continue;
+
+ storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+ break;
+ }
if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
@@ -662,6 +669,322 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
+static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE)
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ fallthrough;
+#endif
+ case 'k':
+ return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ case 'u':
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ }
+
+ return -EINVAL;
+}
+
+/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p
+ */
+#define MAX_PRINTF_BUF_LEN 512
+
+struct bpf_printf_buf {
+ char tmp_buf[MAX_PRINTF_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
+static DEFINE_PER_CPU(int, bpf_printf_buf_used);
+
+static int try_get_fmt_tmp_buf(char **tmp_buf)
+{
+ struct bpf_printf_buf *bufs;
+ int used;
+
+ preempt_disable();
+ used = this_cpu_inc_return(bpf_printf_buf_used);
+ if (WARN_ON_ONCE(used > 1)) {
+ this_cpu_dec(bpf_printf_buf_used);
+ preempt_enable();
+ return -EBUSY;
+ }
+ bufs = this_cpu_ptr(&bpf_printf_buf);
+ *tmp_buf = bufs->tmp_buf;
+
+ return 0;
+}
+
+void bpf_bprintf_cleanup(void)
+{
+ if (this_cpu_read(bpf_printf_buf_used)) {
+ this_cpu_dec(bpf_printf_buf_used);
+ preempt_enable();
+ }
+}
+
+/*
+ * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
+ *
+ * Returns a negative value if fmt is an invalid format string or 0 otherwise.
+ *
+ * This can be used in two ways:
+ * - Format string verification only: when bin_args is NULL
+ * - Arguments preparation: in addition to the above verification, it writes in
+ * bin_args a binary representation of arguments usable by bstr_printf where
+ * pointers from BPF have been sanitized.
+ *
+ * In argument preparation mode, if 0 is returned, safe temporary buffers are
+ * allocated and bpf_bprintf_cleanup should be called to free them after use.
+ */
+int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+ u32 **bin_args, u32 num_args)
+{
+ char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ size_t sizeof_cur_arg, sizeof_cur_ip;
+ int err, i, num_spec = 0;
+ u64 cur_arg;
+ char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
+
+ fmt_end = strnchr(fmt, fmt_size, 0);
+ if (!fmt_end)
+ return -EINVAL;
+ fmt_size = fmt_end - fmt;
+
+ if (bin_args) {
+ if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
+ return -EBUSY;
+
+ tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN;
+ *bin_args = (u32 *)tmp_buf;
+ }
+
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (num_spec >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The string is zero-terminated so if fmt[i] != 0, we can
+ * always access fmt[i + 1], in the worst case it will be a 0
+ */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formatting field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 'p') {
+ sizeof_cur_arg = sizeof(long);
+
+ if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
+ fmt[i + 2] == 's') {
+ fmt_ptype = fmt[i + 1];
+ i += 2;
+ goto fmt_str;
+ }
+
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
+ fmt[i + 1] == 'S') {
+ /* just kernel pointers */
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ i++;
+ goto nocopy_fmt;
+ }
+
+ if (fmt[i + 1] == 'B') {
+ if (tmp_buf) {
+ err = snprintf(tmp_buf,
+ (tmp_buf_end - tmp_buf),
+ "%pB",
+ (void *)(long)raw_args[num_spec]);
+ tmp_buf += (err + 1);
+ }
+
+ i++;
+ num_spec++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
+ (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ i += 2;
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
+ if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
+ sizeof_cur_ip);
+ if (err < 0)
+ memset(cur_ip, 0, sizeof_cur_ip);
+
+			/* hack: bstr_printf expects IP addresses to be
+			 * pre-formatted as strings; ironically, the easiest way
+			 * to do that is to call snprintf.
+			 */
+ ip_spec[2] = fmt[i - 1];
+ ip_spec[3] = fmt[i];
+ err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
+ ip_spec, &cur_ip);
+
+ tmp_buf += err + 1;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 's') {
+ fmt_ptype = fmt[i];
+fmt_str:
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
+ fmt_ptype,
+ tmp_buf_end - tmp_buf);
+ if (err < 0) {
+ tmp_buf[0] = '\0';
+ err = 1;
+ }
+
+ tmp_buf += err;
+ num_spec++;
+
+ continue;
+ }
+
+ sizeof_cur_arg = sizeof(int);
+
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long);
+ i++;
+ }
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long long);
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
+ fmt[i] != 'x' && fmt[i] != 'X') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+nocopy_fmt:
+ if (tmp_buf) {
+ tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
+ if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (sizeof_cur_arg == 8) {
+ *(u32 *)tmp_buf = *(u32 *)&cur_arg;
+ *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
+ } else {
+ *(u32 *)tmp_buf = (u32)(long)cur_arg;
+ }
+ tmp_buf += sizeof_cur_arg;
+ }
+ num_spec++;
+ }
+
+ err = 0;
+out:
+ if (err)
+ bpf_bprintf_cleanup();
+ return err;
+}
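A minimal caller-side sketch of the two modes described in the header comment (illustrative only; fmt, raw_args, num_args, out and out_size are assumed to come from the surrounding helper, and the pattern mirrors what bpf_snprintf() below and the verifier's check_bpf_snprintf_call() do):

	u32 *bin_args;
	int err;

	/* Mode 1: format-string verification only. bin_args is NULL, no
	 * per-cpu temp buffer is taken, nothing to clean up.
	 */
	err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);

	/* Mode 2: argument preparation. On success, bin_args points into a
	 * per-cpu temp buffer that must be released with bpf_bprintf_cleanup().
	 */
	err = bpf_bprintf_prepare(fmt, UINT_MAX, raw_args, &bin_args, num_args);
	if (!err) {
		err = bstr_printf(out, out_size, fmt, bin_args);
		bpf_bprintf_cleanup();
	}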
+
+#define MAX_SNPRINTF_VARARGS 12
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+ const void *, data, u32, data_len)
+{
+ int err, num_args;
+ u32 *bin_args;
+
+ if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+ * can safely give an unbounded size.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ if (err < 0)
+ return err;
+
+ err = bstr_printf(str, str_size, fmt, bin_args);
+
+ bpf_bprintf_cleanup();
+
+ return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+ .func = bpf_snprintf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+ .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
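From the BPF program side, the new helper is used roughly as below (an illustrative fragment, not part of the patch). ARG_PTR_TO_CONST_STR is what forces the format string into a read-only map such as .rodata, and data_len is a byte count that must be a multiple of 8:

	char out[32];
	__u64 args[] = { 1, 42 };
	/* const data placed in .rodata, i.e. a read-only array map,
	 * which satisfies ARG_PTR_TO_CONST_STR
	 */
	static const char fmt[] = "run %d returned %d";

	/* returns the length the output would need, including the NUL byte,
	 * or a negative error
	 */
	long n = bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));

libbpf's BPF_SNPRINTF() convenience macro wraps this pattern and packs the args array automatically.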
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -708,6 +1031,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
default:
break;
}
@@ -748,6 +1073,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
default:
return NULL;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index d2de2abec35b..b4ebd60a6c16 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -816,8 +816,6 @@ static int __init bpf_init(void)
{
int ret;
- mutex_init(&bpf_preload_lock);
-
ret = sysfs_create_mount_point(fs_kobj, "bpf");
if (ret)
return ret;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
#include <linux/slab.h>
#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
#ifdef CONFIG_CGROUP_BPF
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+ bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
#include "../cgroup/cgroup-internal.h"
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index cec792a17e5f..1b7b8a6f34ee 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -726,6 +726,9 @@ const struct bpf_map_ops trie_map_ops = {
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
.map_btf_name = "lpm_trie",
.map_btf_id = &trie_map_btf_id,
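With the generic batch ops wired up, user space can now walk or update LPM tries through libbpf's regular batch API as well. A rough user-space sketch (illustrative; map_fd, key_size and the buffer sizing are assumptions):

	__u32 out_batch, count = 32;
	void *keys = calloc(count, key_size);	/* struct bpf_lpm_trie_key + data */
	__u64 *vals = calloc(count, sizeof(*vals));
	int err;

	/* A NULL in_batch starts from the beginning; keep feeding out_batch
	 * back in until the kernel reports ENOENT.
	 */
	err = bpf_map_lookup_batch(map_fd, NULL, &out_batch,
				   keys, vals, &count, NULL);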
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 250503482cda..941ca06d9dfa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1694,7 +1694,9 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
- bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->jited_linfo);
+ kvfree(prog->aux->linfo);
+ kfree(prog->aux->kfunc_tab);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
@@ -2549,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_tracing_link, link);
info->tracing.attach_type = tr_link->attach_type;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &info->tracing.target_obj_id,
+ &info->tracing.target_btf_id);
return 0;
}
@@ -2643,14 +2648,25 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* target_btf_id using the link_create API.
*
* - if tgt_prog == NULL when this function was called using the old
- * raw_tracepoint_open API, and we need a target from prog->aux
- *
- * The combination of no saved target in prog->aux, and no target
- * specified on load is illegal, and we reject that here.
+ * raw_tracepoint_open API, and we need a target from prog->aux
+ *
+	 * - if both prog->aux->dst_trampoline and tgt_prog are NULL, the
+	 *   program was detached and is going for re-attachment.
*/
if (!prog->aux->dst_trampoline && !tgt_prog) {
- err = -ENOENT;
- goto out_unlock;
+ /*
+ * Allow re-attach for TRACING and LSM programs. If it's
+ * currently linked, bpf_trampoline_link_prog will fail.
+ * EXT programs need to specify tgt_prog_fd, so they
+		 * re-attach in a separate code path.
+ */
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ btf_id = prog->aux->attach_btf_id;
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
if (!prog->aux->dst_trampoline ||
@@ -2946,6 +2962,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_MSG;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_SKB_VERDICT:
return BPF_PROG_TYPE_SK_SKB;
case BPF_LIRC_MODE2:
return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4aa8b52adf25..2d44b5aa0057 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -444,7 +444,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
tr->progs_cnt[kind]++;
err = bpf_trampoline_update(tr);
if (err) {
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
}
out:
@@ -467,7 +467,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
tr->extension_prog = NULL;
goto out;
}
- hlist_del(&prog->aux->tramp_hlist);
+ hlist_del_init(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
err = bpf_trampoline_update(tr);
out:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0399ac092b36..757476c91c98 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,18 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_CALL;
}
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -248,6 +260,7 @@ struct bpf_call_arg_meta {
u32 btf_id;
struct btf *ret_btf;
u32 ret_btf_id;
+ u32 subprogno;
};
struct btf *btf_vmlinux;
@@ -390,6 +403,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
env->prev_linfo = linfo;
}
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct tnum *range, const char *ctx,
+ const char *reg_name)
+{
+ char tn_buf[48];
+
+ verbose(env, "At %s the register %s ", ctx, reg_name);
+ if (!tnum_is_unknown(reg->var_off)) {
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ tnum_strn(tn_buf, sizeof(tn_buf), *range);
+ verbose(env, " should have been in %s\n", tn_buf);
+}
+
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
@@ -409,6 +440,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON;
}
@@ -451,7 +483,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type)
type == ARG_PTR_TO_MEM_OR_NULL ||
type == ARG_PTR_TO_CTX_OR_NULL ||
type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+ type == ARG_PTR_TO_STACK_OR_NULL;
}
/* Determine whether the function releases some resources allocated by another
@@ -541,6 +574,8 @@ static const char * const reg_type_str[] = {
[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
[PTR_TO_RDWR_BUF] = "rdwr_buf",
[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
};
static char slot_type_char[] = {
@@ -612,6 +647,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
+ t == PTR_TO_MAP_KEY ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
verbose(env, ",ks=%d,vs=%d",
@@ -1362,9 +1398,7 @@ static bool __reg64_bound_s32(s64 a)
static bool __reg64_bound_u32(u64 a)
{
- if (a > U32_MIN && a < U32_MAX)
- return true;
- return false;
+ return a > U32_MIN && a < U32_MAX;
}
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
@@ -1375,10 +1409,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
reg->s32_min_value = (s32)reg->smin_value;
reg->s32_max_value = (s32)reg->smax_value;
}
- if (__reg64_bound_u32(reg->umin_value))
+ if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
reg->u32_min_value = (u32)reg->umin_value;
- if (__reg64_bound_u32(reg->umax_value))
reg->u32_max_value = (u32)reg->umax_value;
+ }
/* Intersecting with the old var_off might have improved our bounds
* slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
@@ -1519,39 +1553,210 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
}
ret = find_subprog(env, off);
if (ret >= 0)
- return 0;
+ return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
verbose(env, "too many subprograms\n");
return -E2BIG;
}
+ /* determine subprog starts. The end is one before the next starts */
env->subprog_info[env->subprog_cnt++].start = off;
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
+ return env->subprog_cnt - 1;
+}
+
+struct bpf_kfunc_desc {
+ struct btf_func_model func_model;
+ u32 func_id;
+ s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ /* func_id is not greater than BTF_MAX_TYPE */
+ return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+ struct bpf_kfunc_desc desc = {
+ .func_id = func_id,
+ };
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ return bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+ const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_desc_tab *tab;
+ struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_desc *desc;
+ const char *func_name;
+ unsigned long addr;
+ int err;
+
+ prog_aux = env->prog->aux;
+ tab = prog_aux->kfunc_tab;
+ if (!tab) {
+ if (!btf_vmlinux) {
+ verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required for calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!bpf_jit_supports_kfunc_call()) {
+ verbose(env, "JIT does not support calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->gpl_compatible) {
+ verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+ return -EINVAL;
+ }
+
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ prog_aux->kfunc_tab = tab;
+ }
+
+ if (find_kfunc_desc(env->prog, func_id))
+ return 0;
+
+ if (tab->nr_descs == MAX_KFUNC_DESCS) {
+ verbose(env, "too many different kernel function calls\n");
+ return -E2BIG;
+ }
+
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %u is not a function\n",
+ func_id);
+ return -EINVAL;
+ }
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ addr = kallsyms_lookup_name(func_name);
+ if (!addr) {
+ verbose(env, "cannot find address for kernel function %s\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ desc = &tab->descs[tab->nr_descs++];
+ desc->func_id = func_id;
+ desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+ err = btf_distill_func_proto(&env->log, btf_vmlinux,
+ func_proto, func_name,
+ &desc->func_model);
+ if (!err)
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id, NULL);
+ return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm > d1->imm)
+ return 1;
+ else if (d0->imm < d1->imm)
+ return -1;
return 0;
}
-static int check_subprogs(struct bpf_verifier_env *env)
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ if (!tab)
+ return;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+ return res ? &res->func_model : NULL;
+}
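The table is built keyed by BTF func_id during verification; once do_misc_fixups() has rewritten insn->imm into a __bpf_call_base-relative offset, sort_kfunc_descs_by_imm() re-sorts it so an arch JIT can recover everything from the instruction alone. An illustrative JIT-side sketch (not part of the patch):

	/* insn->src_reg == BPF_PSEUDO_KFUNC_CALL, after fixup_kfunc_call() */
	const struct btf_func_model *fm = bpf_jit_find_kfunc_model(prog, insn);
	u8 *target = (u8 *)__bpf_call_base + insn->imm;	/* the kfunc's address */

	/* fm describes argument and return sizes for emitting the call */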
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
- int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
+ int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
- if (ret < 0)
+ if (ret)
return ret;
- /* determine subprog starts. The end is one before the next starts */
- for (i = 0; i < insn_cnt; i++) {
- if (!bpf_pseudo_call(insn + i))
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+ !bpf_pseudo_kfunc_call(insn))
continue;
+
if (!env->bpf_capable) {
- verbose(env,
- "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
- ret = add_subprog(env, i + insn[i].imm + 1);
+
+ if (bpf_pseudo_func(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ if (ret >= 0)
+ /* remember subprog */
+ insn[1].imm = ret;
+ } else if (bpf_pseudo_call(insn)) {
+ ret = add_subprog(env, i + insn->imm + 1);
+ } else {
+ ret = add_kfunc_call(env, insn->imm);
+ }
+
if (ret < 0)
return ret;
}
@@ -1565,6 +1770,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
@@ -1873,6 +2088,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
return i;
}
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+ const struct btf_type *func;
+
+ if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+ return NULL;
+
+ func = btf_type_by_id(btf_vmlinux, insn->imm);
+ return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -1881,6 +2107,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u32 *reg_mask, u64 *stack_mask)
{
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -2295,6 +2522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_MEM_OR_NULL:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
return true;
default:
return false;
@@ -2899,6 +3128,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
reg = &cur_regs(env)[regno];
switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size);
@@ -3304,6 +3537,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys ";
break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -3405,7 +3641,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- if (!bpf_pseudo_call(insn + i))
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
@@ -3842,7 +4078,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
/* for access checks, reg->off is just part of off */
off += reg->off;
- if (reg->type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4258,6 +4506,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
case PTR_TO_MAP_VALUE:
if (check_map_access_type(env, regno, reg->off, access_size,
meta && meta->raw_mode ? BPF_WRITE :
@@ -4474,6 +4725,7 @@ static const struct bpf_reg_types map_key_value_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
@@ -4505,6 +4757,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
PTR_TO_RDONLY_BUF,
@@ -4517,6 +4770,7 @@ static const struct bpf_reg_types int_ptr_types = {
PTR_TO_STACK,
PTR_TO_PACKET,
PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
},
};
@@ -4529,6 +4783,9 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
+static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -4557,6 +4814,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4738,6 +4998,8 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_FUNC) {
+ meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
@@ -4805,6 +5067,44 @@ skip_type_check:
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
+ } else if (arg_type == ARG_PTR_TO_CONST_STR) {
+ struct bpf_map *map = reg->map_ptr;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
+
+ if (!bpf_map_is_rdonly(map)) {
+			verbose(env, "R%d does not point to a readonly map\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+			verbose(env, "R%d is not a constant address\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
}
return err;
@@ -5258,13 +5558,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
}
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx);
+
+static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
{
struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
- int i, err, subprog, target_insn;
+ int err;
bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
@@ -5273,14 +5579,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -E2BIG;
}
- target_insn = *insn_idx + insn->imm;
- subprog = find_subprog(env, target_insn + 1);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn + 1);
- return -EFAULT;
- }
-
caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
@@ -5291,7 +5589,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_arg_match(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -5335,11 +5633,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err)
return err;
- /* copy r1 - r5 args that callee can access. The copy includes parent
- * pointers, which connects us up to the liveness chain
- */
- for (i = BPF_REG_1; i <= BPF_REG_5; i++)
- callee->regs[i] = caller->regs[i];
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
clear_caller_saved_regs(env, caller->regs);
@@ -5347,7 +5643,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
state->curframe++;
/* and go analyze first insn of the callee */
- *insn_idx = target_insn;
+ *insn_idx = env->subprog_info[subprog].start - 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
@@ -5358,6 +5654,92 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee)
+{
+ /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+ * void *callback_ctx, u64 flags);
+ * callback_fn(struct bpf_map *map, void *key, void *value,
+ * void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx)
+{
+ int i;
+
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ int subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n",
+ target_insn);
+ return -EFAULT;
+ }
+
+ return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ if (bpf_map_ptr_poisoned(insn_aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5380,8 +5762,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
state->curframe--;
caller = state->frame[state->curframe];
- /* return to the caller whatever r0 had in the callee */
- caller->regs[BPF_REG_0] = *r0;
+ if (callee->in_callback_fn) {
+ /* enforce R0 return value range [0, 1]. */
+ struct tnum range = tnum_range(0, 1);
+
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
+ return -EACCES;
+ }
+ if (!tnum_in(range, r0->var_off)) {
+ verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+ return -EINVAL;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
/* Transfer references to the caller */
err = transfer_reference_state(caller, callee);
@@ -5409,6 +5805,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
if (ret_type != RET_INTEGER ||
(func_id != BPF_FUNC_get_stack &&
+ func_id != BPF_FUNC_get_task_stack &&
func_id != BPF_FUNC_probe_read_str &&
func_id != BPF_FUNC_probe_read_kernel_str &&
func_id != BPF_FUNC_probe_read_user_str))
@@ -5436,7 +5833,9 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_map_push_elem &&
func_id != BPF_FUNC_map_pop_elem &&
- func_id != BPF_FUNC_map_peek_elem)
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem &&
+ func_id != BPF_FUNC_redirect_map)
return 0;
if (map == NULL) {
@@ -5517,15 +5916,55 @@ static int check_reference_leak(struct bpf_verifier_env *env)
return state->acquired_refs ? -EINVAL : 0;
}
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+ struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+ struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ int err, fmt_map_off, num_args;
+ u64 fmt_addr;
+ char *fmt;
+
+ /* data must be an array of u64 */
+ if (data_len_reg->var_off.value % 8)
+ return -EINVAL;
+ num_args = data_len_reg->var_off.value / 8;
+
+ /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+ * and map_direct_value_addr is set.
+ */
+ fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+ fmt_map_off);
+ if (err) {
+ verbose(env, "verifier bug\n");
+ return -EFAULT;
+ }
+ fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+	/* We are also guaranteed that fmt+fmt_map_off is NUL-terminated, so we
+	 * can focus on validating the format specifiers.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ if (err < 0)
+ verbose(env, "Invalid format string\n");
+
+ return err;
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
bool changes_data;
- int i, err;
+ int i, err, func_id;
/* find function prototype */
+ func_id = insn->imm;
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
func_id);
@@ -5571,7 +6010,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
meta.func_id = func_id;
/* check args */
- for (i = 0; i < 5; i++) {
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
err = check_func_arg(env, i, &meta, fn);
if (err)
return err;
@@ -5621,6 +6060,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ if (func_id == BPF_FUNC_for_each_map_elem) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_map_elem_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
+ if (func_id == BPF_FUNC_snprintf) {
+ err = check_bpf_snprintf_call(env, regs);
+ if (err < 0)
+ return err;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -5776,6 +6228,98 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return 0;
}
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size or argument size.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+ if (regno == BPF_REG_0) {
+ /* Function return value */
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = reg_size == sizeof(u64) ?
+ DEF_NOT_SUBREG : env->insn_idx + 1;
+ } else {
+ /* Function argument */
+ if (reg_size == sizeof(u64)) {
+ mark_insn_zext(env, reg);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ } else {
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+ }
+ }
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct btf_type *t, *func, *func_proto, *ptr_type;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ u32 i, nargs, func_id, ptr_type_id;
+ const struct btf_param *args;
+ int err;
+
+ func_id = insn->imm;
+ func = btf_type_by_id(btf_vmlinux, func_id);
+ func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+ if (!env->ops->check_kfunc_call ||
+ !env->ops->check_kfunc_call(func_id)) {
+ verbose(env, "calling kernel function %s is not allowed\n",
+ func_name);
+ return -EACCES;
+ }
+
+ /* Check the arguments */
+ err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+ if (err)
+ return err;
+
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ /* Check return type */
+ t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+ if (btf_type_is_scalar(t)) {
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+ } else if (btf_type_is_ptr(t)) {
+ ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+ &ptr_type_id);
+ if (!btf_type_is_struct(ptr_type)) {
+ ptr_type_name = btf_name_by_offset(btf_vmlinux,
+ ptr_type->name_off);
+ verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name, btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+ mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+ nargs = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nargs; i++) {
+ u32 regno = i + 1;
+
+ t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+ if (btf_type_is_ptr(t))
+ mark_btf_func_reg_size(env, regno, sizeof(void *));
+ else
+ /* scalar. ensured by btf_check_kfunc_arg_match() */
+ mark_btf_func_reg_size(env, regno, t->size);
+ }
+
+ return 0;
+}
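For context, this is roughly what a kernel-function call looks like from the BPF program side (an illustrative sketch in the style of the TCP congestion-control selftests, which were among the first users; the names are hypothetical). The callee is declared as an extern __ksym and resolved against vmlinux BTF, and check_kfunc_call() above only admits it if the program type's ops allow that BTF id:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* resolved against vmlinux BTF at load time */
extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;

SEC("struct_ops/sample_cong_avoid")
void BPF_PROG(sample_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		acked = tcp_slow_start(tp, acked);
}

char LICENSE[] SEC("license") = "GPL";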
+
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -5920,7 +6464,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
aux->alu_limit != alu_limit))
return REASON_PATHS;
- /* Corresponding fixup done in fixup_bpf_calls(). */
+ /* Corresponding fixup done in do_misc_fixups(). */
aux->alu_state = alu_state;
aux->alu_limit = alu_limit;
return 0;
@@ -5952,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
{
struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
struct bpf_verifier_state *vstate = env->cur_state;
+ bool off_is_imm = tnum_is_const(off_reg->var_off);
bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
@@ -5982,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_limit = abs(tmp_aux->alu_limit - alu_limit);
} else {
alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
}
@@ -8345,6 +8891,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = insn[1].imm;
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
+ return 0;
+ }
+
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
@@ -8573,17 +9137,7 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
- char tn_buf[48];
-
- verbose(env, "At program exit the register R0 ");
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
- }
- tnum_strn(tn_buf, sizeof(tn_buf), range);
- verbose(env, " should have been in %s\n", tn_buf);
+ verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
return -EINVAL;
}
@@ -8710,6 +9264,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
+static int visit_func_call_insn(int t, int insn_cnt,
+ struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret;
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+ if (ret)
+ return ret;
+
+ if (t + 1 < insn_cnt)
+ init_explored_state(env, t + 1);
+ if (visit_callee) {
+ init_explored_state(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+ env, false);
+ }
+ return ret;
+}
+
/* Visits the instruction at index t and returns one of the following:
* < 0 - an error occurred
* DONE_EXPLORING - the instruction was fully explored
@@ -8720,6 +9295,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
struct bpf_insn *insns = env->prog->insnsi;
int ret;
+ if (bpf_pseudo_func(insns + t))
+ return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
BPF_CLASS(insns[t].code) != BPF_JMP32)
@@ -8730,18 +9308,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
- if (ret)
- return ret;
-
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
- if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
- }
- return ret;
+ return visit_func_call_insn(t, insn_cnt, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
if (BPF_SRC(insns[t].code) != BPF_K)
@@ -9354,6 +9922,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
*/
return false;
}
+ case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
@@ -10037,6 +10606,7 @@ static int do_check(struct bpf_verifier_env *env)
if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
.cb_print = verbose,
.private_data = env,
};
@@ -10184,7 +10754,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10199,11 +10770,12 @@ static int do_check(struct bpf_verifier_env *env)
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ err = check_kfunc_call(env, insn);
else
- err = check_helper_call(env, insn->imm, env->insn_idx);
+ err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
-
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
@@ -10632,6 +11204,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
goto next_insn;
}
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
@@ -10764,9 +11342,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
}
/* single env->prog->insni[off] instruction was replaced with the range
@@ -11405,6 +11987,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* subprog is encoded in insn[1].imm */
+ continue;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
/* Upon error here we cannot fall back to interpreter but
@@ -11494,6 +12082,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11534,6 +12123,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn[1].imm;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
@@ -11579,6 +12174,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ continue;
+ }
if (!bpf_pseudo_call(insn))
continue;
insn->off = env->insn_aux_data[i].call_imm;
@@ -11590,7 +12190,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
- bpf_prog_free_unused_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++) {
@@ -11613,7 +12213,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
- bpf_prog_free_jited_linfo(prog);
+ bpf_prog_jit_attempt_done(prog);
return err;
}
@@ -11622,6 +12222,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
int i, depth;
#endif
int err = 0;
@@ -11635,6 +12236,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
/* When JIT fails the progs with bpf2bpf calls and tail_calls
* have to be rejected, since interpreter doesn't support them yet.
@@ -11643,6 +12248,14 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return -EINVAL;
}
for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn))
continue;
depth = get_callee_stack_depth(env, insn, i);
@@ -11655,12 +12268,30 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-/* fixup insn->imm field of bpf_call instructions
- * and inline eligible helpers as explicit sequence of BPF instructions
- *
- * this function is called after eBPF program passed verification
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ /* insn->imm has the btf func_id. Replace it with
+	 * an address (relative to __bpf_call_base).
+ */
+ desc = find_kfunc_desc(env->prog, insn->imm);
+ if (!desc) {
+ verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+ insn->imm);
+ return -EFAULT;
+ }
+
+ insn->imm = desc->imm;
+
+ return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
*/
-static int fixup_bpf_calls(struct bpf_verifier_env *env)
+static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
@@ -11675,6 +12306,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, ret, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ /* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
@@ -11715,6 +12347,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
if (BPF_CLASS(insn->code) == BPF_LD &&
(BPF_MODE(insn->code) == BPF_ABS ||
BPF_MODE(insn->code) == BPF_IND)) {
@@ -11734,13 +12367,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn insn_buf[16];
struct bpf_insn *patch = &insn_buf[0];
- bool issrc, isneg;
+ bool issrc, isneg, isimm;
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
@@ -11751,28 +12384,29 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
BPF_ALU_SANITIZE_SRC;
+ isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
off_reg = issrc ? insn->src_reg : insn->dst_reg;
- if (isneg)
- *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
- *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
- *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
- *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
- if (issrc) {
- *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
- off_reg);
- insn->src_reg = BPF_REG_AX;
+ if (isimm) {
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
} else {
- *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
- BPF_REG_AX);
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
}
+ if (!issrc)
+ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+ insn->src_reg = BPF_REG_AX;
if (isneg)
insn->code = insn->code == code_add ?
code_sub : code_add;
*patch++ = *insn;
- if (issrc && isneg)
+ if (issrc && isneg && !isimm)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
cnt = patch - insn_buf;
@@ -11790,6 +12424,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = fixup_kfunc_call(env, insn);
+ if (ret)
+ return ret;
+ continue;
+ }
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
@@ -11882,7 +12522,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_delete_elem ||
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
- insn->imm == BPF_FUNC_map_peek_elem)) {
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -11924,6 +12565,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
@@ -11950,11 +12594,16 @@ patch_map_ops_generic:
insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
__bpf_call_base;
continue;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CAST_CALL(ops->map_redirect) -
+ __bpf_call_base;
+ continue;
}
goto patch_call_imm;
}
+ /* Implement bpf_jiffies64 inline. */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_jiffies64) {
struct bpf_insn ld_jiffies_addr[2] = {
@@ -12010,6 +12659,8 @@ patch_call_imm:
}
}
+ sort_kfunc_descs_by_imm(env->prog);
+
return 0;
}
@@ -12120,7 +12771,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_func_arg_match(env, subprog, regs);
+ ret = btf_check_subprog_arg_match(env, subprog, regs);
if (ret == -EFAULT)
/* unlikely verifier bug. abort.
* ret == 0 and ret < 0 are sadly acceptable for
@@ -12720,6 +13371,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
+ ret = add_subprog_and_kfunc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -12770,7 +13425,7 @@ skip_full_check:
ret = convert_ctx_accesses(env);
if (ret == 0)
- ret = fixup_bpf_calls(env);
+ ret = do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9153b20e5cc6..e049edd66776 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1339,6 +1339,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
mutex_unlock(&cgroup_mutex);
+ cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
@@ -1751,6 +1752,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
+ if (ss->css_rstat_flush) {
+ list_del_rcu(&css->rstat_css_node);
+ list_add_rcu(&css->rstat_css_node,
+ &dcgrp->rstat_css_list);
+ }
+
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -1971,10 +1978,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
- ret = rebind_subsystems(root, ss_mask);
+ ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto exit_stats;
+
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
@@ -2006,6 +2017,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = 0;
goto out;
+exit_stats:
+ cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -4934,8 +4947,7 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4976,8 +4988,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5034,7 +5045,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ if (ss->css_rstat_flush)
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
BUG_ON(cgroup_css(cgrp, ss));
@@ -5159,11 +5170,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- if (cgroup_on_dfl(parent)) {
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
- }
+ ret = cgroup_rstat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
/* create the directory */
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
@@ -5250,8 +5259,7 @@ out_psi_free:
out_kernfs_remove:
kernfs_remove(cgrp->kn);
out_stat_exit:
- if (cgroup_on_dfl(parent))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d51175cedfca..3a3fd2993a65 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -25,13 +25,8 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- struct cgroup *parent;
unsigned long flags;
- /* nothing to do for root */
- if (!cgroup_parent(cgrp))
- return;
-
/*
* Speculative already-on-list test. This may race leading to
* temporary inaccuracies, which is fine.
@@ -46,10 +41,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
raw_spin_lock_irqsave(cpu_lock, flags);
/* put @cgrp and all ancestors on the corresponding updated lists */
- for (parent = cgroup_parent(cgrp); parent;
- cgrp = parent, parent = cgroup_parent(cgrp)) {
+ while (true) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -58,8 +53,17 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (rstatc->updated_next)
break;
+ /* Root has no parent to link it to, but mark it busy */
+ if (!parent) {
+ rstatc->updated_next = cgrp;
+ break;
+ }
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
prstatc->updated_children = cgrp;
+
+ cgrp = parent;
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
@@ -113,23 +117,26 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
*/
if (rstatc->updated_next) {
struct cgroup *parent = cgroup_parent(pos);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
- struct cgroup_rstat_cpu *nrstatc;
- struct cgroup **nextp;
-
- nextp = &prstatc->updated_children;
- while (true) {
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
-
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
+
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (true) {
+ struct cgroup_rstat_cpu *nrstatc;
+
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- *nextp = rstatc->updated_next;
rstatc->updated_next = NULL;
-
return pos;
}
@@ -285,8 +292,6 @@ void __init cgroup_rstat_boot(void)
for_each_possible_cpu(cpu)
raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
-
- BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}
/*
@@ -311,11 +316,15 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_base_stat cur, delta;
unsigned seq;
+ /* Root-level stats are sourced from system-wide CPU stats */
+ if (!parent)
+ return;
+
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
@@ -328,8 +337,8 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
- /* propagate global delta to parent */
- if (parent) {
+ /* propagate global delta to parent (unless that's root) */
+ if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index d3fd428f4b92..eb701b2ac72f 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,5 +1,4 @@
# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
# CONFIG_DEVMEM is not set
# CONFIG_FHANDLE is not set
# CONFIG_INET_LRO is not set
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
new file mode 100644
index 000000000000..2f0e6bf6db2c
--- /dev/null
+++ b/kernel/configs/tiny-base.config
@@ -0,0 +1 @@
+CONFIG_EMBEDDED=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1b6302ecbabe..e538518556f4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -135,6 +136,11 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
return cpuhp_hp_states + state;
}
+static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
+{
+ return bringup ? !step->startup.single : !step->teardown.single;
+}
+
/**
* cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
@@ -157,26 +163,24 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
if (st->fail == state) {
st->fail = CPUHP_INVALID;
-
- if (!(bringup ? step->startup.single : step->teardown.single))
- return 0;
-
return -EAGAIN;
}
+ if (cpuhp_step_empty(bringup, step)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (!step->multi_instance) {
WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
- if (!cb)
- return 0;
+
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
- if (!cbm)
- return 0;
/* Single invocation for instance add/remove */
if (node) {
@@ -461,13 +465,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;
st->rollback = false;
st->last = NULL;
st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
return prev_state;
}
@@ -475,6 +482,17 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
+ st->target = prev_state;
+
+ /*
+ * Already rolling back. No need to invert the bringup value or to change
+ * the current state.
+ */
+ if (st->rollback)
+ return;
+
st->rollback = true;
/*
@@ -488,8 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}
- st->target = prev_state;
- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -591,10 +610,53 @@ static int finish_cpu(unsigned int cpu)
* Hotplug state machine related functions
*/
-static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
+/*
+ * Get the next state to run. Empty ones will be skipped. Returns true if a
+ * state must be run.
+ *
+ * st->state will be modified ahead of time, to match state_to_run, as if it
+ * has already run.
+ */
+static bool cpuhp_next_state(bool bringup,
+ enum cpuhp_state *state_to_run,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ do {
+ if (bringup) {
+ if (st->state >= target)
+ return false;
+
+ *state_to_run = ++st->state;
+ } else {
+ if (st->state <= target)
+ return false;
+
+ *state_to_run = st->state--;
+ }
+
+ if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
+ break;
+ } while (true);
+
+ return true;
+}
+
+static int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
- for (st->state--; st->state > st->target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ enum cpuhp_state state;
+ int err = 0;
+
+ while (cpuhp_next_state(bringup, &state, st, target)) {
+ err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
+ if (err)
+ break;
+ }
+
+ return err;
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -617,16 +679,12 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state prev_state = st->state;
int ret = 0;
- while (st->state < target) {
- st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
- if (ret) {
- if (can_rollback_cpu(st)) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
- }
- break;
- }
+ ret = cpuhp_invoke_callback_range(true, cpu, st, target);
+ if (ret) {
+ cpuhp_reset_state(st, prev_state);
+ if (can_rollback_cpu(st))
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
+ prev_state));
}
return ret;
}
@@ -640,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)
init_completion(&st->done_up);
init_completion(&st->done_down);
+ st->cpu = cpu;
}
static int cpuhp_should_run(unsigned int cpu)
@@ -690,17 +749,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
state = st->cb_state;
st->should_run = false;
} else {
- if (bringup) {
- st->state++;
- state = st->state;
- st->should_run = (st->state < st->target);
- WARN_ON_ONCE(st->state > st->target);
- } else {
- state = st->state;
- st->state--;
- st->should_run = (st->state > st->target);
- WARN_ON_ONCE(st->state < st->target);
- }
+ st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
+ if (!st->should_run)
+ goto end;
}
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
@@ -728,6 +779,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
}
+end:
cpuhp_lock_release(bringup);
lockdep_release_cpus_lock();
@@ -881,19 +933,18 @@ static int take_cpu_down(void *_param)
return err;
/*
- * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
- * do this step again.
+ * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
+ * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
*/
- WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
- st->state--;
+ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
+
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
- /*
- * DYING must not fail!
- */
- WARN_ON_ONCE(ret);
- }
+ ret = cpuhp_invoke_callback_range(false, cpu, st, target);
+
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -975,27 +1026,22 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++)
- cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
-}
-
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
- if (ret) {
- st->target = prev_state;
- if (st->state < prev_state)
- undo_cpu_down(cpu, st);
- break;
- }
+ ret = cpuhp_invoke_callback_range(false, cpu, st, target);
+ if (ret) {
+
+ cpuhp_reset_state(st, prev_state);
+
+ if (st->state < prev_state)
+ WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
+ prev_state));
}
+
return ret;
}
@@ -1045,9 +1091,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
- if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- cpuhp_reset_state(st, prev_state);
- __cpuhp_kick_ap(st);
+ if (ret && st->state < prev_state) {
+ if (st->state == CPUHP_TEARDOWN_CPU) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ } else {
+ WARN(1, "DEAD callback error for CPU%d", cpu);
+ }
}
out:
@@ -1164,14 +1214,12 @@ void notify_cpu_starting(unsigned int cpu)
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- while (st->state < target) {
- st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
- /*
- * STARTING must not fail!
- */
- WARN_ON_ONCE(ret);
- }
+ ret = cpuhp_invoke_callback_range(true, cpu, st, target);
+
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
/*
@@ -1777,8 +1825,7 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
* If there's nothing to do, we're done.
* Relies on the union for multi_instance.
*/
- if ((bringup && !sp->startup.single) ||
- (!bringup && !sp->teardown.single))
+ if (cpuhp_step_empty(bringup, sp))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
@@ -2207,6 +2254,11 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
+ if (fail == CPUHP_INVALID) {
+ st->fail = fail;
+ return count;
+ }
+
if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
return -EINVAL;
@@ -2217,6 +2269,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return -EINVAL;
/*
+ * DEAD callbacks cannot fail...
+ * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
+ * triggers STARTING callbacks; a failure in this state would
+ * hinder rollback.
+ */
+ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
+ return -EINVAL;
+
+ /*
* Cannot fail anything that doesn't have callbacks.
*/
mutex_lock(&cpuhp_state_mutex);
@@ -2460,6 +2521,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);
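
The new __cpu_dying_mask exported above is kept up to date from cpuhp_set_state() and cpuhp_reset_state(), which lets other subsystems ask whether a CPU is in the middle of being torn down. A hypothetical consumer might look like the sketch below; the helper name and the policy are illustrative only, not part of this patch:

/* Pick a CPU that is online and not currently going down, using the
 * cpu_dying() test backed by __cpu_dying_mask. Hypothetical helper. */
static int pick_stable_cpu(const struct cpumask *candidates)
{
        int cpu;

        for_each_cpu(cpu, candidates) {
                if (cpu_online(cpu) && !cpu_dying(cpu))
                        return cpu;
        }
        return nr_cpu_ids;      /* nothing suitable found */
}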
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..e1d274cd741b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -33,7 +33,7 @@ do { \
static struct kmem_cache *cred_jar;
/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
/*
* The initial credentials for the initial task
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 002268262c9a..f737e3347059 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, sg->length,
- dir, SYNC_FOR_DEVICE);
+ swiotlb_sync_single_for_device(dev, paddr, sg->length,
+ dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, sg->length,
@@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu(paddr, sg->length, dir);
if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
- SYNC_FOR_CPU);
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
+ dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, sg->length);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index b98615578737..50afc05b6f1d 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, addr);
if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(paddr, size, dir);
@@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
}
if (unlikely(is_swiotlb_buffer(paddr)))
- swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
if (dir == DMA_FROM_DEVICE)
arch_dma_mark_clean(paddr, size);
@@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index e0e64f8b0739..9b9af1bd6be3 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2020 Hisilicon Limited.
+ * Copyright (C) 2020 HiSilicon Limited.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -38,7 +38,8 @@ struct map_benchmark {
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction */
__u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u8 expansion[80]; /* For future use */
+ __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
+ __u8 expansion[76]; /* For future use */
};
struct map_benchmark_data {
@@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data)
void *buf;
dma_addr_t dma_addr;
struct map_benchmark_data *map = data;
+ int npages = map->bparam.granule;
+ u64 size = npages * PAGE_SIZE;
int ret = 0;
- buf = (void *)__get_free_page(GFP_KERNEL);
+ buf = alloc_pages_exact(size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data)
* 66 means everything goes well! 66 is lucky.
*/
if (map->dir != DMA_FROM_DEVICE)
- memset(buf, 0x66, PAGE_SIZE);
+ memset(buf, 0x66, size);
map_stime = ktime_get();
- dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
+ dma_addr = dma_map_single(map->dev, buf, size, map->dir);
if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
pr_err("dma_map_single failed on %s\n",
dev_name(map->dev));
@@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data)
ndelay(map->bparam.dma_trans_ns);
unmap_stime = ktime_get();
- dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
+ dma_unmap_single(map->dev, dma_addr, size, map->dir);
unmap_etime = ktime_get();
unmap_delta = ktime_sub(unmap_etime, unmap_stime);
@@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data)
}
out:
- free_page((unsigned long)buf);
+ free_pages_exact(buf, size);
return ret;
}
@@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
struct map_benchmark_data *map = file->private_data;
void __user *argp = (void __user *)arg;
u64 old_dma_mask;
-
int ret;
if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
@@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
return -EINVAL;
}
+ if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
+ pr_err("invalid granule size\n");
+ return -EINVAL;
+ }
+
switch (map->bparam.dma_dir) {
case DMA_MAP_BIDIRECTIONAL:
map->dir = DMA_BIDIRECTIONAL;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6a633679933..2b06a809d0b9 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
EXPORT_SYMBOL(dma_free_attrs);
-struct page *dma_alloc_pages(struct device *dev, size_t size,
+static struct page *__dma_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- struct page *page;
if (WARN_ON_ONCE(!dev->coherent_dma_mask))
return NULL;
@@ -490,33 +489,162 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
- page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
- else if (ops->alloc_pages)
- page = ops->alloc_pages(dev, size, dma_handle, dir, gfp);
- else
+ return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (!ops->alloc_pages)
return NULL;
+ return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+}
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+struct page *dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (page)
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
-void dma_free_pages(struct device *dev, size_t size, struct page *page,
+static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
size = PAGE_ALIGN(size);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
-
if (dma_alloc_direct(dev, ops))
dma_direct_free_pages(dev, size, page, dma_handle, dir);
else if (ops->free_pages)
ops->free_pages(dev, size, page, dma_handle, dir);
}
+
+void dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ debug_dma_unmap_page(dev, dma_handle, size, dir);
+ __dma_free_pages(dev, size, page, dma_handle, dir);
+}
EXPORT_SYMBOL_GPL(dma_free_pages);
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct page *page)
+{
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(page) + vma->vm_pgoff,
+ vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(dma_mmap_pages);
+
+static struct sg_table *alloc_single_sgt(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp)
+{
+ struct sg_table *sgt;
+ struct page *page;
+
+ sgt = kmalloc(sizeof(*sgt), gfp);
+ if (!sgt)
+ return NULL;
+ if (sg_alloc_table(sgt, 1, gfp))
+ goto out_free_sgt;
+ page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp);
+ if (!page)
+ goto out_free_table;
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ sg_dma_len(sgt->sgl) = sgt->sgl->length;
+ return sgt;
+out_free_table:
+ sg_free_table(sgt);
+out_free_sgt:
+ kfree(sgt);
+ return NULL;
+}
+
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct sg_table *sgt;
+
+ if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
+ return NULL;
+
+ if (ops && ops->alloc_noncontiguous)
+ sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+ else
+ sgt = alloc_single_sgt(dev, size, dir, gfp);
+
+ if (sgt) {
+ sgt->nents = 1;
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+ }
+ return sgt;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+static void free_single_sgt(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address,
+ dir);
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
+ if (ops && ops->free_noncontiguous)
+ ops->free_noncontiguous(dev, size, sgt, dir);
+ else
+ free_single_sgt(dev, size, sgt, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (ops && ops->alloc_noncontiguous)
+ return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+ return page_address(sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
+
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops && ops->alloc_noncontiguous)
+ vunmap(vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
+
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct sg_table *sgt)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops && ops->alloc_noncontiguous) {
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count ||
+ vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+ }
+ return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
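
The dma_alloc_noncontiguous() family added above gives drivers an allocation that may be scattered in physical memory but is presented as a single sg_table, with optional CPU access through vmap. A driver-side sketch of the intended call sequence (not taken from the patch; the function name, device pointer, size and error handling are placeholders):

static int example_use_noncontiguous(struct device *dev, size_t size)
{
        struct sg_table *sgt;
        void *vaddr;

        sgt = dma_alloc_noncontiguous(dev, size, DMA_BIDIRECTIONAL,
                                      GFP_KERNEL, 0);
        if (!sgt)
                return -ENOMEM;

        /* Optional kernel mapping of the (possibly discontiguous) pages */
        vaddr = dma_vmap_noncontiguous(dev, size, sgt);
        if (!vaddr) {
                dma_free_noncontiguous(dev, size, sgt, DMA_BIDIRECTIONAL);
                return -ENOMEM;
        }

        /* ... hand sgt->sgl to the device, touch the buffer via vaddr ... */

        dma_vunmap_noncontiguous(dev, vaddr);
        dma_free_noncontiguous(dev, size, sgt, DMA_BIDIRECTIONAL);
        return 0;
}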
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 905c3fa005f1..b4526668072e 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -66,6 +66,5 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
return;
}
- unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
vunmap(cpu_addr);
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c10e855a03bc..8ca7d505d61c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -59,32 +59,11 @@
*/
#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-enum swiotlb_force swiotlb_force;
-
-/*
- * Used to do a quick range check in swiotlb_tbl_unmap_single and
- * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
- * API.
- */
-phys_addr_t io_tlb_start, io_tlb_end;
-
-/*
- * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
- */
-static unsigned long io_tlb_nslabs;
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-/*
- * The number of used IO TLB block
- */
-static unsigned long io_tlb_used;
+enum swiotlb_force swiotlb_force;
-/*
- * This is a free list describing the number of free entries available from
- * each index
- */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
+struct io_tlb_mem *io_tlb_default_mem;
/*
* Max segment that we can provide which (if pages are contiguous) will
@@ -92,57 +71,30 @@ static unsigned int io_tlb_index;
*/
static unsigned int max_segment;
-/*
- * We need to save away the original address corresponding to a mapped entry
- * for the sync operations.
- */
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-static phys_addr_t *io_tlb_orig_addr;
-
-/*
- * The mapped buffer's size should be validated during a sync operation.
- */
-static size_t *io_tlb_orig_size;
-
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
-
-static int late_alloc;
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static int __init
setup_io_tlb_npages(char *str)
{
if (isdigit(*str)) {
- io_tlb_nslabs = simple_strtoul(str, &str, 0);
/* avoid tail segment of size < IO_TLB_SEGSIZE */
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ default_nslabs =
+ ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
}
if (*str == ',')
++str;
- if (!strcmp(str, "force")) {
+ if (!strcmp(str, "force"))
swiotlb_force = SWIOTLB_FORCE;
- } else if (!strcmp(str, "noforce")) {
+ else if (!strcmp(str, "noforce"))
swiotlb_force = SWIOTLB_NO_FORCE;
- io_tlb_nslabs = 1;
- }
return 0;
}
early_param("swiotlb", setup_io_tlb_npages);
-static bool no_iotlb_memory;
-
-unsigned long swiotlb_nr_tbl(void)
-{
- return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
-}
-EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
-
unsigned int swiotlb_max_segment(void)
{
- return unlikely(no_iotlb_memory) ? 0 : max_segment;
+ return io_tlb_default_mem ? max_segment : 0;
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
@@ -156,42 +108,34 @@ void swiotlb_set_max_segment(unsigned int val)
unsigned long swiotlb_size_or_default(void)
{
- unsigned long size;
-
- size = io_tlb_nslabs << IO_TLB_SHIFT;
-
- return size ? size : (IO_TLB_DEFAULT_SIZE);
+ return default_nslabs << IO_TLB_SHIFT;
}
-void __init swiotlb_adjust_size(unsigned long new_size)
+void __init swiotlb_adjust_size(unsigned long size)
{
- unsigned long size;
-
/*
* If swiotlb parameter has not been specified, give a chance to
* architectures such as those supporting memory encryption to
* adjust/expand SWIOTLB size for their use.
*/
- if (!io_tlb_nslabs) {
- size = ALIGN(new_size, IO_TLB_SIZE);
- io_tlb_nslabs = size >> IO_TLB_SHIFT;
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-
- pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
- }
+ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
+ return;
+ size = ALIGN(size, IO_TLB_SIZE);
+ default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
void swiotlb_print_info(void)
{
- unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ struct io_tlb_mem *mem = io_tlb_default_mem;
- if (no_iotlb_memory) {
+ if (!mem) {
pr_warn("No low mem\n");
return;
}
- pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end,
- bytes >> 20);
+ pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
+ (mem->nslabs << IO_TLB_SHIFT) >> 20);
}
static inline unsigned long io_tlb_offset(unsigned long val)
@@ -212,64 +156,51 @@ static inline unsigned long nr_slots(u64 val)
*/
void __init swiotlb_update_mem_attributes(void)
{
+ struct io_tlb_mem *mem = io_tlb_default_mem;
void *vaddr;
unsigned long bytes;
- if (no_iotlb_memory || late_alloc)
+ if (!mem || mem->late_alloc)
return;
-
- vaddr = phys_to_virt(io_tlb_start);
- bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+ vaddr = phys_to_virt(mem->start);
+ bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
memset(vaddr, 0, bytes);
}
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
- unsigned long i, bytes;
+ unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+ struct io_tlb_mem *mem;
size_t alloc_size;
- bytes = nslabs << IO_TLB_SHIFT;
-
- io_tlb_nslabs = nslabs;
- io_tlb_start = __pa(tlb);
- io_tlb_end = io_tlb_start + bytes;
-
- /*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
- */
- alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
- io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!io_tlb_list)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return 0;
- alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
- io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!io_tlb_orig_addr)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ /* protect against double initialization */
+ if (WARN_ON_ONCE(io_tlb_default_mem))
+ return -ENOMEM;
- alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
- io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!io_tlb_orig_size)
+ alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
+ mem = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);
-
- for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- io_tlb_orig_size[i] = 0;
+ mem->nslabs = nslabs;
+ mem->start = __pa(tlb);
+ mem->end = mem->start + bytes;
+ mem->index = 0;
+ spin_lock_init(&mem->lock);
+ for (i = 0; i < mem->nslabs; i++) {
+ mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
}
- io_tlb_index = 0;
- no_iotlb_memory = false;
+ io_tlb_default_mem = mem;
if (verbose)
swiotlb_print_info();
-
- swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+ swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
}
@@ -280,29 +211,24 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
void __init
swiotlb_init(int verbose)
{
- size_t default_size = IO_TLB_DEFAULT_SIZE;
- unsigned char *vstart;
- unsigned long bytes;
-
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
-
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+ void *tlb;
- /* Get IO TLB memory from the low pages */
- vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
- if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
return;
- if (io_tlb_start) {
- memblock_free_early(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
- io_tlb_start = 0;
- }
+ /* Get IO TLB memory from the low pages */
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+ if (!tlb)
+ goto fail;
+ if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
+ goto fail_free_mem;
+ return;
+
+fail_free_mem:
+ memblock_free_early(__pa(tlb), bytes);
+fail:
pr_warn("Cannot allocate buffer");
- no_iotlb_memory = true;
}
/*
@@ -313,22 +239,22 @@ swiotlb_init(int verbose)
int
swiotlb_late_init_with_default_size(size_t default_size)
{
- unsigned long bytes, req_nslabs = io_tlb_nslabs;
+ unsigned long nslabs =
+ ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ unsigned long bytes;
unsigned char *vstart = NULL;
unsigned int order;
int rc = 0;
- if (!io_tlb_nslabs) {
- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
- }
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return 0;
/*
* Get IO TLB memory from the low pages
*/
- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+ order = get_order(nslabs << IO_TLB_SHIFT);
+ nslabs = SLABS_PER_PAGE << order;
+ bytes = nslabs << IO_TLB_SHIFT;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
@@ -338,134 +264,99 @@ swiotlb_late_init_with_default_size(size_t default_size)
order--;
}
- if (!vstart) {
- io_tlb_nslabs = req_nslabs;
+ if (!vstart)
return -ENOMEM;
- }
+
if (order != get_order(bytes)) {
pr_warn("only able to allocate %ld MB\n",
(PAGE_SIZE << order) >> 20);
- io_tlb_nslabs = SLABS_PER_PAGE << order;
+ nslabs = SLABS_PER_PAGE << order;
}
- rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
+ rc = swiotlb_late_init_with_tbl(vstart, nslabs);
if (rc)
free_pages((unsigned long)vstart, order);
return rc;
}
-static void swiotlb_cleanup(void)
-{
- io_tlb_end = 0;
- io_tlb_start = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
-}
-
int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
- unsigned long i, bytes;
+ unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+ struct io_tlb_mem *mem;
- bytes = nslabs << IO_TLB_SHIFT;
+ if (swiotlb_force == SWIOTLB_NO_FORCE)
+ return 0;
- io_tlb_nslabs = nslabs;
- io_tlb_start = virt_to_phys(tlb);
- io_tlb_end = io_tlb_start + bytes;
+ /* protect against double initialization */
+ if (WARN_ON_ONCE(io_tlb_default_mem))
+ return -ENOMEM;
- set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
- memset(tlb, 0, bytes);
+ mem = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(struct_size(mem, slots, nslabs)));
+ if (!mem)
+ return -ENOMEM;
- /*
- * Allocate and initialize the free list array. This array is used
- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
- * between io_tlb_start and io_tlb_end.
- */
- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
- if (!io_tlb_list)
- goto cleanup3;
-
- io_tlb_orig_addr = (phys_addr_t *)
- __get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs *
- sizeof(phys_addr_t)));
- if (!io_tlb_orig_addr)
- goto cleanup4;
-
- io_tlb_orig_size = (size_t *)
- __get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs *
- sizeof(size_t)));
- if (!io_tlb_orig_size)
- goto cleanup5;
-
-
- for (i = 0; i < io_tlb_nslabs; i++) {
- io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- io_tlb_orig_size[i] = 0;
+ mem->nslabs = nslabs;
+ mem->start = virt_to_phys(tlb);
+ mem->end = mem->start + bytes;
+ mem->index = 0;
+ mem->late_alloc = 1;
+ spin_lock_init(&mem->lock);
+ for (i = 0; i < mem->nslabs; i++) {
+ mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
}
- io_tlb_index = 0;
- no_iotlb_memory = false;
-
- swiotlb_print_info();
- late_alloc = 1;
-
- swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+ set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
+ memset(tlb, 0, bytes);
+ io_tlb_default_mem = mem;
+ swiotlb_print_info();
+ swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
-
-cleanup5:
- free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
- sizeof(phys_addr_t)));
-
-cleanup4:
- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
- sizeof(int)));
- io_tlb_list = NULL;
-cleanup3:
- swiotlb_cleanup();
- return -ENOMEM;
}
void __init swiotlb_exit(void)
{
- if (!io_tlb_orig_addr)
+ struct io_tlb_mem *mem = io_tlb_default_mem;
+ size_t size;
+
+ if (!mem)
return;
- if (late_alloc) {
- free_pages((unsigned long)io_tlb_orig_size,
- get_order(io_tlb_nslabs * sizeof(size_t)));
- free_pages((unsigned long)io_tlb_orig_addr,
- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
- sizeof(int)));
- free_pages((unsigned long)phys_to_virt(io_tlb_start),
- get_order(io_tlb_nslabs << IO_TLB_SHIFT));
- } else {
- memblock_free_late(__pa(io_tlb_orig_addr),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
- memblock_free_late(__pa(io_tlb_orig_size),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
- memblock_free_late(__pa(io_tlb_list),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
- memblock_free_late(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
- }
- swiotlb_cleanup();
+ size = struct_size(mem, slots, mem->nslabs);
+ if (mem->late_alloc)
+ free_pages((unsigned long)mem, get_order(size));
+ else
+ memblock_free_late(__pa(mem), PAGE_ALIGN(size));
+ io_tlb_default_mem = NULL;
}
/*
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
-static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir)
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+ enum dma_data_direction dir)
{
+ struct io_tlb_mem *mem = io_tlb_default_mem;
+ int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = mem->slots[index].orig_addr;
+ size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = phys_to_virt(tlb_addr);
+ if (orig_addr == INVALID_PHYS_ADDR)
+ return;
+
+ if (size > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n",
+ alloc_size, size);
+ size = alloc_size;
+ }
+
if (PageHighMem(pfn_to_page(pfn))) {
/* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
@@ -517,9 +408,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
}
-static unsigned int wrap_index(unsigned int index)
+static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
{
- if (index >= io_tlb_nslabs)
+ if (index >= mem->nslabs)
return 0;
return index;
}
@@ -531,9 +422,10 @@ static unsigned int wrap_index(unsigned int index)
static int find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size)
{
+ struct io_tlb_mem *mem = io_tlb_default_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask;
+ phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
@@ -552,15 +444,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
if (alloc_size >= PAGE_SIZE)
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
- spin_lock_irqsave(&io_tlb_lock, flags);
- if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ spin_lock_irqsave(&mem->lock, flags);
+ if (unlikely(nslots > mem->nslabs - mem->used))
goto not_found;
- index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
+ index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
do {
if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
(orig_addr & iotlb_align_mask)) {
- index = wrap_index(index + 1);
+ index = wrap_index(mem, index + 1);
continue;
}
@@ -572,34 +464,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
if (!iommu_is_span_boundary(index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (io_tlb_list[index] >= nslots)
+ if (mem->slots[index].list >= nslots)
goto found;
}
- index = wrap_index(index + stride);
+ index = wrap_index(mem, index + stride);
} while (index != wrap);
not_found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ spin_unlock_irqrestore(&mem->lock, flags);
return -1;
found:
for (i = index; i < index + nslots; i++)
- io_tlb_list[i] = 0;
+ mem->slots[i].list = 0;
for (i = index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
- io_tlb_list[i]; i--)
- io_tlb_list[i] = ++count;
+ mem->slots[i].list; i--)
+ mem->slots[i].list = ++count;
/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < io_tlb_nslabs)
- io_tlb_index = index + nslots;
+ if (index + nslots < mem->nslabs)
+ mem->index = index + nslots;
else
- io_tlb_index = 0;
- io_tlb_used += nslots;
+ mem->index = 0;
+ mem->used += nslots;
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ spin_unlock_irqrestore(&mem->lock, flags);
return index;
}
@@ -607,11 +499,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
{
+ struct io_tlb_mem *mem = io_tlb_default_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
- unsigned int index, i;
+ unsigned int i;
+ int index;
phys_addr_t tlb_addr;
- if (no_iotlb_memory)
+ if (!mem)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
if (mem_encrypt_active())
@@ -628,7 +522,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, io_tlb_nslabs, io_tlb_used);
+ alloc_size, mem->nslabs, mem->used);
return (phys_addr_t)DMA_MAPPING_ERROR;
}
@@ -638,49 +532,37 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* needed.
*/
for (i = 0; i < nr_slots(alloc_size + offset); i++) {
- io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
- io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+ mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ mem->slots[index + i].alloc_size =
+ alloc_size - (i << IO_TLB_SHIFT);
}
- tlb_addr = slot_addr(io_tlb_start, index) + offset;
+ tlb_addr = slot_addr(mem->start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
-static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
-{
- if (*size > orig_size) {
- /* Warn and truncate mapping_size */
- dev_WARN_ONCE(hwdev, 1,
- "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
- orig_size, *size);
- *size = orig_size;
- }
-}
-
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
{
+ struct io_tlb_mem *mem = io_tlb_default_mem;
unsigned long flags;
unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
- int i, count, nslots = nr_slots(alloc_size + offset);
- int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t orig_addr = io_tlb_orig_addr[index];
-
- validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+ int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+ int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ int count, i;
/*
* First, sync the memory before unmapping the entry
*/
- if (orig_addr != INVALID_PHYS_ADDR &&
- !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -688,9 +570,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- spin_lock_irqsave(&io_tlb_lock, flags);
+ spin_lock_irqsave(&mem->lock, flags);
if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
- count = io_tlb_list[index + nslots];
+ count = mem->slots[index + nslots].list;
else
count = 0;
@@ -699,9 +581,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* superseding slots
*/
for (i = index + nslots - 1; i >= index; i--) {
- io_tlb_list[i] = ++count;
- io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
- io_tlb_orig_size[i] = 0;
+ mem->slots[i].list = ++count;
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
}
/*
@@ -709,44 +591,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
* available (non zero)
*/
for (i = index - 1;
- io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
i--)
- io_tlb_list[i] = ++count;
- io_tlb_used -= nslots;
- spin_unlock_irqrestore(&io_tlb_lock, flags);
+ mem->slots[i].list = ++count;
+ mem->used -= nslots;
+ spin_unlock_irqrestore(&mem->lock, flags);
}
-void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir,
- enum dma_sync_target target)
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
{
- int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
- size_t orig_size = io_tlb_orig_size[index];
- phys_addr_t orig_addr = io_tlb_orig_addr[index];
-
- if (orig_addr == INVALID_PHYS_ADDR)
- return;
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
+}
- validate_sync_size_and_truncate(hwdev, orig_size, &size);
-
- switch (target) {
- case SYNC_FOR_CPU:
- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr,
- size, DMA_FROM_DEVICE);
- else
- BUG_ON(dir != DMA_TO_DEVICE);
- break;
- case SYNC_FOR_DEVICE:
- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr,
- size, DMA_TO_DEVICE);
- else
- BUG_ON(dir != DMA_FROM_DEVICE);
- break;
- default:
- BUG();
- }
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ else
+ BUG_ON(dir != DMA_TO_DEVICE);
}
/*
@@ -770,7 +637,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
/* Ensure that the address returned is DMA'ble */
dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
- swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
+ swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
@@ -790,22 +657,21 @@ size_t swiotlb_max_mapping_size(struct device *dev)
bool is_swiotlb_active(void)
{
- /*
- * When SWIOTLB is initialized, even if io_tlb_start points to physical
- * address zero, io_tlb_end surely doesn't.
- */
- return io_tlb_end != 0;
+ return io_tlb_default_mem != NULL;
}
+EXPORT_SYMBOL_GPL(is_swiotlb_active);
#ifdef CONFIG_DEBUG_FS
static int __init swiotlb_create_debugfs(void)
{
- struct dentry *root;
+ struct io_tlb_mem *mem = io_tlb_default_mem;
- root = debugfs_create_dir("swiotlb", NULL);
- debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
- debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
+ if (!mem)
+ return 0;
+ mem->debugfs = debugfs_create_dir("swiotlb", NULL);
+ debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+ debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
return 0;
}
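
All of the former io_tlb_* globals are folded into the single io_tlb_default_mem instance used throughout the rewritten code above. Reconstructed from those accesses, the structure looks roughly like the sketch below; field order and exact types are an approximation, and the authoritative definition lives in include/linux/swiotlb.h:

struct io_tlb_mem {
        phys_addr_t start;              /* was io_tlb_start */
        phys_addr_t end;                /* was io_tlb_end */
        unsigned long nslabs;           /* was io_tlb_nslabs */
        unsigned long used;             /* was io_tlb_used */
        unsigned int index;             /* was io_tlb_index */
        spinlock_t lock;                /* was io_tlb_lock */
        struct dentry *debugfs;
        bool late_alloc;                /* was the late_alloc flag */
        struct io_tlb_slot {
                phys_addr_t orig_addr;  /* was io_tlb_orig_addr[] */
                size_t alloc_size;      /* was io_tlb_orig_size[] */
                unsigned int list;      /* was io_tlb_list[] */
        } slots[];                      /* sized via struct_size() */
};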
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 48ea8863183b..6fee4a7e88d7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -581,11 +581,6 @@ static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
-extern __weak const char *perf_pmu_name(void)
-{
- return "pmu";
-}
-
static inline u64 perf_clock(void)
{
return local_clock();
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..fd1c04193e18 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -162,6 +162,7 @@ static void __exit_signal(struct task_struct *tsk)
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
}
+ exit_task_sigqueue_cache(tsk);
}
static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -1439,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
TASK_INTERRUPTIBLE, p);
}
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+ struct task_struct *target)
+{
+ struct task_struct *parent =
+ !ptrace ? target->real_parent : target->parent;
+
+ return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+ same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+ bool ptrace;
+ struct task_struct *target;
+ int retval;
+
+ ptrace = false;
+ target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+ if (target && is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ ptrace = true;
+ target = pid_task(wo->wo_pid, PIDTYPE_PID);
+ if (target && target->ptrace &&
+ is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
+}
+
static long do_wait(struct wait_opts *wo)
{
- struct task_struct *tsk;
int retval;
trace_sched_process_wait(wo->wo_pid);
@@ -1463,19 +1503,27 @@ repeat:
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
- retval = do_wait_thread(wo, tsk);
- if (retval)
- goto end;
- retval = ptrace_do_wait(wo, tsk);
+ if (wo->wo_type == PIDTYPE_PID) {
+ retval = do_wait_pid(wo);
if (retval)
goto end;
+ } else {
+ struct task_struct *tsk = current;
+
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ goto end;
- if (wo->wo_flags & __WNOTHREAD)
- break;
- } while_each_thread(current, tsk);
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask:
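
With the change above, a wait that targets a specific PID no longer walks every thread's child and tracee lists; pid_task() resolves the target directly. From user space this is simply the common waitpid()-on-one-child pattern (a sketch only, not part of the patch):

#include <sys/types.h>
#include <sys/wait.h>

/* Reap one specific child; in the kernel this now takes the
 * PIDTYPE_PID fast path (do_wait_pid()) added above. */
static pid_t reap_child(pid_t child)
{
        int status;

        return waitpid(child, &status, 0);
}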
diff --git a/kernel/fork.c b/kernel/fork.c
index f592c9a0272a..dc06afd725cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
+#include <linux/bpf.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -379,14 +380,17 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
+ if (vm) {
+ int i;
- /* All stack pages are in the same node. */
- if (vm)
- mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
- else
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ account * (PAGE_SIZE / 1024));
+ } else {
+ /* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
+ }
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -734,6 +738,7 @@ void __put_task_struct(struct task_struct *tsk)
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
+ bpf_task_storage_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -927,6 +932,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
+ tsk->pf_io_worker = NULL;
account_kernel_stack(tsk, 1);
@@ -1139,7 +1145,7 @@ void mmput_async(struct mm_struct *mm)
* invocations: in mmput() nobody alive left, in execve task is single
* threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
* mm->exe_file, but does so without using set_mm_exe_file() in order
- * to do avoid the need for any locks.
+ * to avoid the need for any locks.
*/
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
@@ -1390,7 +1396,6 @@ fail_nomem:
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
- int retval;
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
@@ -1417,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
- goto good_mm;
+ } else {
+ mm = dup_mm(tsk, current->mm);
+ if (!mm)
+ return -ENOMEM;
}
- retval = -ENOMEM;
- mm = dup_mm(tsk, current->mm);
- if (!mm)
- goto fail_nomem;
-
-good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
-
-fail_nomem:
- return retval;
}
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
@@ -1737,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
* /proc/<pid>/status where Pid and NSpid are always shown relative to
* the pid namespace of the procfs instance. The difference becomes
* obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestoral relation is
+ * different branch of the tree, i.e. where no ancestral relation is
* present between the pid namespaces:
* - create two new pid namespaces ns1 and ns2 in the initial pid
* namespace (also take care to create new mount namespaces in the
@@ -1941,7 +1940,7 @@ static __latent_entropy struct task_struct *copy_process(
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
retval = -ERESTARTNOINTR;
- if (signal_pending(current))
+ if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
@@ -2009,6 +2008,7 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
+ p->sigqueue_cache = NULL;
p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@ -2078,6 +2078,9 @@ static __latent_entropy struct task_struct *copy_process(
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ RCU_INIT_POINTER(p->bpf_storage, NULL);
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
@@ -2725,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
return false;
/*
- * - make the CLONE_DETACHED bit reuseable for clone3
- * - make the CSIGNAL bits reuseable for clone3
+ * - make the CLONE_DETACHED bit reusable for clone3
+ * - make the CSIGNAL bits reusable for clone3
*/
if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
return false;
diff --git a/kernel/futex.c b/kernel/futex.c
index c98b825da9cf..4938a00bc785 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
- cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
@@ -3758,42 +3757,52 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return -ENOSYS;
}
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
const struct __kernel_timespec __user *, utime,
u32 __user *, uaddr2, u32, val3)
{
- struct timespec64 ts;
+ int ret, cmd = op & FUTEX_CMD_MASK;
ktime_t t, *tp = NULL;
- u32 val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
+ struct timespec64 ts;
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (utime && futex_cmd_has_timeout(cmd)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
if (get_timespec64(&ts, utime))
return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- else if (!(op & FUTEX_CLOCK_REALTIME))
- t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
tp = &t;
}
- /*
- * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
- * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
- */
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#ifdef CONFIG_COMPAT
@@ -3959,31 +3968,20 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
- struct timespec64 ts;
+ int ret, cmd = op & FUTEX_CMD_MASK;
ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
+ struct timespec64 ts;
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (utime && futex_cmd_has_timeout(cmd)) {
if (get_old_timespec32(&ts, utime))
return -EFAULT;
- if (!timespec64_valid(&ts))
- return -EINVAL;
-
- t = timespec64_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- else if (!(op & FUTEX_CLOCK_REALTIME))
- t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
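The futex_cmd_has_timeout()/futex_init_timeout() helpers above only consolidate how the kernel validates and converts the timeout; the userspace contract is unchanged. A minimal userspace sketch, for illustration only (the futex_wait() wrapper is an invented name; only FUTEX_WAIT with a relative timeout is shown):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>

/* FUTEX_WAIT still takes a *relative* timeout; futex_init_timeout() turns it
 * into an absolute CLOCK_MONOTONIC deadline inside the kernel. */
static long futex_wait(uint32_t *uaddr, uint32_t expected,
		       const struct timespec *rel_timeout)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
		       rel_timeout, NULL, 0);
}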
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index f62de2dea8a3..58f87a3092f3 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
+ depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
select CONSTRUCTORS
default n
help
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 0ffe9f194080..073a3738c5e6 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -49,6 +49,55 @@ void gcov_enable_events(void)
mutex_unlock(&gcov_lock);
}
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
#ifdef CONFIG_MODULES
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
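store_gcov_u32()/store_gcov_u64() are now shared by the gcc and clang backends, and both convert_to_gcda() implementations call them in the same tag/length/payload pattern. A hedged sketch of that pattern (emit_counter_record() is a hypothetical helper, not part of the patch); passing a NULL buffer gives the same dry-run sizing behaviour convert_to_gcda() relies on:

/* Hypothetical helper mirroring how convert_to_gcda() emits one record:
 * a 32-bit tag, a length in 32-bit words, then the 64-bit counter values. */
static size_t emit_counter_record(char *buffer, size_t pos, u32 tag,
				  const u64 *counters, u32 num)
{
	u32 i;

	pos += store_gcov_u32(buffer, pos, tag);
	pos += store_gcov_u32(buffer, pos, num * 2);	/* length in words */
	for (i = 0; i < num; i++)
		pos += store_gcov_u64(buffer, pos, counters[i]);
	return pos;
}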
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c466c7fbdece..cbb0bed958ab 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -48,9 +48,8 @@
#include <linux/list.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "gcov.h"
typedef void (*llvm_gcov_callback)(void);
@@ -70,16 +69,10 @@ struct gcov_fn_info {
u32 ident;
u32 checksum;
-#if CONFIG_CLANG_VERSION < 110000
- u8 use_extra_checksum;
-#endif
u32 cfg_checksum;
u32 num_counters;
u64 *counters;
-#if CONFIG_CLANG_VERSION < 110000
- const char *function_name;
-#endif
};
static struct gcov_info *current_info;
@@ -109,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
}
EXPORT_SYMBOL(llvm_gcov_init);
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_start_file(const char *orig_filename, const char version[4],
- u32 checksum)
-{
- current_info->filename = orig_filename;
- memcpy(&current_info->version, version, sizeof(current_info->version));
- current_info->checksum = checksum;
-}
-EXPORT_SYMBOL(llvm_gcda_start_file);
-#else
void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
{
current_info->filename = orig_filename;
@@ -126,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
current_info->checksum = checksum;
}
EXPORT_SYMBOL(llvm_gcda_start_file);
-#endif
-
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_emit_function(u32 ident, const char *function_name,
- u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
-{
- struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
-
- if (!info)
- return;
-
- INIT_LIST_HEAD(&info->head);
- info->ident = ident;
- info->checksum = func_checksum;
- info->use_extra_checksum = use_extra_checksum;
- info->cfg_checksum = cfg_checksum;
- if (function_name)
- info->function_name = kstrdup(function_name, GFP_KERNEL);
- list_add_tail(&info->head, &current_info->functions);
-}
-#else
void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
{
struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -161,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
info->cfg_checksum = cfg_checksum;
list_add_tail(&info->head, &current_info->functions);
}
-#endif
EXPORT_SYMBOL(llvm_gcda_emit_function);
void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
@@ -292,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
!list_is_last(&fn_ptr2->head, &info2->functions)) {
if (fn_ptr1->checksum != fn_ptr2->checksum)
return false;
-#if CONFIG_CLANG_VERSION < 110000
- if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
- return false;
- if (fn_ptr1->use_extra_checksum &&
- fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
- return false;
-#else
if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
return false;
-#endif
fn_ptr1 = list_next_entry(fn_ptr1, head);
fn_ptr2 = list_next_entry(fn_ptr2, head);
}
@@ -330,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
}
}
-#if CONFIG_CLANG_VERSION < 110000
-static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
-{
- size_t cv_size; /* counter values size */
- struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
- GFP_KERNEL);
- if (!fn_dup)
- return NULL;
- INIT_LIST_HEAD(&fn_dup->head);
-
- fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
- if (!fn_dup->function_name)
- goto err_name;
-
- cv_size = fn->num_counters * sizeof(fn->counters[0]);
- fn_dup->counters = vmalloc(cv_size);
- if (!fn_dup->counters)
- goto err_counters;
- memcpy(fn_dup->counters, fn->counters, cv_size);
-
- return fn_dup;
-
-err_counters:
- kfree(fn_dup->function_name);
-err_name:
- kfree(fn_dup);
- return NULL;
-}
-#else
static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
{
size_t cv_size; /* counter values size */
@@ -369,7 +293,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
INIT_LIST_HEAD(&fn_dup->head);
cv_size = fn->num_counters * sizeof(fn->counters[0]);
- fn_dup->counters = vmalloc(cv_size);
+ fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL);
if (!fn_dup->counters) {
kfree(fn_dup);
return NULL;
@@ -379,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
return fn_dup;
}
-#endif
/**
* gcov_info_dup - duplicate profiling data set
@@ -420,99 +343,18 @@ err:
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
-#if CONFIG_CLANG_VERSION < 110000
-void gcov_info_free(struct gcov_info *info)
-{
- struct gcov_fn_info *fn, *tmp;
-
- list_for_each_entry_safe(fn, tmp, &info->functions, head) {
- kfree(fn->function_name);
- vfree(fn->counters);
- list_del(&fn->head);
- kfree(fn);
- }
- kfree(info->filename);
- kfree(info);
-}
-#else
void gcov_info_free(struct gcov_info *info)
{
struct gcov_fn_info *fn, *tmp;
list_for_each_entry_safe(fn, tmp, &info->functions, head) {
- vfree(fn->counters);
+ kvfree(fn->counters);
list_del(&fn->head);
kfree(fn);
}
kfree(info->filename);
kfree(info);
}
-#endif
-
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
/**
* convert_to_gcda - convert profiling data set to gcda file format
@@ -521,7 +363,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
size_t pos = 0;
@@ -535,21 +377,10 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
u32 i;
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
-#if CONFIG_CLANG_VERSION < 110000
- pos += store_gcov_u32(buffer, pos,
- fi_ptr->use_extra_checksum ? 3 : 2);
-#else
pos += store_gcov_u32(buffer, pos, 3);
-#endif
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
-#if CONFIG_CLANG_VERSION < 110000
- if (fi_ptr->use_extra_checksum)
- pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#else
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#endif
-
pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
for (i = 0; i < fi_ptr->num_counters; i++)
@@ -558,102 +389,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
return pos;
}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 82babf5aa077..5c3086cad8f9 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
#include "gcov.h"
/**
@@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str)
}
__setup("gcov_persist=", gcov_persist_setup);
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ size_t size;
+ loff_t pos;
+ char buffer[];
+};
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+static struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+ size_t size;
+
+ /* Dry-run to get the actual buffer size. */
+ size = convert_to_gcda(NULL, info);
+
+ iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->info = info;
+ iter->size = size;
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+}
+
+
+/**
+ * gcov_iter_free - free iterator data
+ * @iter: file iterator
+ */
+static void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kvfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+static void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+static int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
+
/*
* seq_file.start() implementation for gcov data files. Note that the
* gcov_iterator interface is designed to be more restrictive than seq_file
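With the iterator now private to fs.c, the seq_file callbacks that follow are its only users. A simplified, illustrative sketch of the intended life cycle (dump_gcov_info() is a made-up name, not the actual seq_file wiring, and error handling is trimmed):

/* Illustration only: allocate, walk, and free a gcov_iterator. */
static void dump_gcov_info(struct seq_file *seq, struct gcov_info *info)
{
	struct gcov_iterator *iter = gcov_iter_new(info);

	if (!iter)
		return;
	gcov_iter_start(iter);
	do {
		if (gcov_iter_write(iter, seq))
			break;
	} while (!gcov_iter_next(iter));
	gcov_iter_free(iter);
}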
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index c53408a00d0b..460c12b7dfea 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -15,8 +15,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "gcov.h"
#if (__GNUC__ >= 10)
@@ -310,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
cv_size = sizeof(gcov_type) * sci_ptr->num;
- dci_ptr->values = vmalloc(cv_size);
+ dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL);
if (!dci_ptr->values)
goto err_free;
@@ -352,7 +351,7 @@ void gcov_info_free(struct gcov_info *info)
ci_ptr = info->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
- vfree(ci_ptr->values);
+ kvfree(ci_ptr->values);
kfree(info->functions[fi_idx]);
}
@@ -363,71 +362,6 @@ free_info:
kfree(info);
}
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
-
/**
* convert_to_gcda - convert profiling data set to gcda file format
* @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
struct gcov_ctr_info *ci_ptr;
@@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
return pos;
}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 6ab2c1808c9d..912b8ea01d33 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
+size_t convert_to_gcda(char *buffer, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
@@ -58,16 +59,9 @@ enum gcov_action {
void gcov_event(enum gcov_action action, struct gcov_info *info);
void gcov_enable_events(void);
-/* Iterator control. */
-struct seq_file;
-struct gcov_iterator;
-
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
-void gcov_iter_free(struct gcov_iterator *iter);
-void gcov_iter_start(struct gcov_iterator *iter);
-int gcov_iter_next(struct gcov_iterator *iter);
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+/* writing helpers */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v);
+size_t store_gcov_u64(void *buffer, size_t off, u64 v);
/* gcov_info control. */
void gcov_info_reset(struct gcov_info *info);
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index c1510f0ab3ea..34a1dc2abc7d 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -36,7 +36,7 @@ all_dirs="$all_dirs $dir_list"
#
# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
# updated, but the contents might be still the same. When any CONFIG option is
-# changed, Kconfig touches the corresponding timestamp file include/config/*.h.
+# changed, Kconfig touches the corresponding timestamp file include/config/*.
# Hence, the md5sum detects the configuration change anyway. We do not need to
# check include/generated/autoconf.h explicitly.
#
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index a23ac2bbf433..f8f23af6ab0d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
irq_gc_unlock(gc);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
static u32 irq_readl_be(void __iomem *addr)
{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f42ef868efd3..6284443b87ec 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
/**
- * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
- * @of_node: pointer to interrupt controller's device tree node.
+ * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs
+ * @fwnode: firmware node for the interrupt controller
* @size: total number of irqs in mapping
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
@@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
* irqs get mapped dynamically on the fly. However, if the controller requires
* static virq assignments (non-DT boot) then it will set that up correctly.
*/
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- const struct irq_domain_ops *ops,
- void *host_data)
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+ unsigned int size,
+ unsigned int first_irq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irq_domain *domain;
- domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
+ domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
if (!domain)
return NULL;
@@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
/* attempt to allocated irq_descs */
int rc = irq_alloc_descs(first_irq, first_irq, size,
- of_node_to_nid(of_node));
+ of_node_to_nid(to_of_node(fwnode)));
if (rc < 0)
pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
first_irq);
@@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
return domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_domain_create_simple);
/**
* irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
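For callers, the change means the simple-domain constructor can now be fed any fwnode, not just a device-tree node; DT users presumably keep irq_domain_add_simple() as a thin wrapper in the header (not shown in this hunk). A hedged sketch of the new entry point, where everything except irq_domain_create_simple() and dev_fwnode() is invented for illustration:

/* Illustration only: register a 32-interrupt simple domain for a device
 * described by ACPI, DT, or a software node alike. */
static struct irq_domain *my_probe_irq_domain(struct device *dev,
					      const struct irq_domain_ops *my_ops)
{
	return irq_domain_create_simple(dev_fwnode(dev), 32, 0, my_ops, NULL);
}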
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e8da1e71583a..23a7a0ba1388 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -19,7 +19,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <asm/processor.h>
-
+#include <linux/kasan.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
@@ -70,6 +70,9 @@ bool irq_work_queue(struct irq_work *work)
if (!irq_work_claim(work))
return false;
+ /*record irq_work call stack in order to print it in KASAN reports*/
+ kasan_record_aux_stack(work);
+
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
__irq_work_queue_local(work);
@@ -98,6 +101,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
+ kasan_record_aux_stack(work);
+
preempt_disable();
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index c1dd02f3be8b..e65de172ccf7 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -266,9 +266,10 @@ static const struct file_operations debugfs_ops =
.release = single_release
};
-static void __init kcsan_debugfs_init(void)
+static int __init kcsan_debugfs_init(void)
{
debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+ return 0;
}
late_initcall(kcsan_debugfs_init);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index a0b6780740c8..f099baee3578 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -37,6 +37,7 @@
#include <linux/compiler.h>
#include <linux/hugetlb.h>
#include <linux/objtool.h>
+#include <linux/kmsg_dump.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -1165,7 +1166,7 @@ int kernel_kexec(void)
#endif
{
kexec_in_progress = true;
- kernel_restart_prepare(NULL);
+ kernel_restart_prepare("kexec reboot");
migrate_to_reboot_cpu();
/*
@@ -1179,6 +1180,7 @@ int kernel_kexec(void)
machine_shutdown();
}
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_kexec(kexec_image);
#ifdef CONFIG_KEXEC_JUMP
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 5c3447cf7ad5..33400ff051a8 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image)
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
+ if (!sha_regions) {
+ ret = -ENOMEM;
goto out_free_desc;
+ }
desc->tfm = tfm;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3cd075ce2a1e..b717134ebe17 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
modprobe_path is set via /proc/sys.
*/
-char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
static void free_modprobe_argv(struct subprocess_info *info)
{
diff --git a/kernel/kthread.c b/kernel/kthread.c
index a1972eba2917..fe3f2a40d61e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -84,6 +84,25 @@ static inline struct kthread *to_kthread(struct task_struct *k)
return (__force void *)k->set_child_tid;
}
+/*
+ * Variant of to_kthread() that doesn't assume @p is a kthread.
+ *
+ * Per construction; when:
+ *
+ * (p->flags & PF_KTHREAD) && p->set_child_tid
+ *
+ * the task is both a kthread and struct kthread is persistent. However
+ * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
+ * begin_new_exec()).
+ */
+static inline struct kthread *__to_kthread(struct task_struct *p)
+{
+ void *kthread = (__force void *)p->set_child_tid;
+ if (kthread && !(p->flags & PF_KTHREAD))
+ kthread = NULL;
+ return kthread;
+}
+
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
@@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
*/
void *kthread_func(struct task_struct *task)
{
- if (task->flags & PF_KTHREAD)
- return to_kthread(task)->threadfn;
+ struct kthread *kthread = __to_kthread(task);
+ if (kthread)
+ return kthread->threadfn;
return NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);
@@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
*/
void *kthread_probe_data(struct task_struct *task)
{
- struct kthread *kthread = to_kthread(task);
+ struct kthread *kthread = __to_kthread(task);
void *data = NULL;
- copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
+ if (kthread)
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
@@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
-bool kthread_is_per_cpu(struct task_struct *k)
+bool kthread_is_per_cpu(struct task_struct *p)
{
- struct kthread *kthread = to_kthread(k);
+ struct kthread *kthread = __to_kthread(p);
if (!kthread)
return false;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index b94f3831e963..ec36b73f4733 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
/* Set the waiting flag to notify readers that a writer is pending */
- atomic_add(_QW_WAITING, &lock->cnts);
+ atomic_or(_QW_WAITING, &lock->cnts);
/* When no more readers or writers, set the locked flag */
do {
diff --git a/kernel/module.c b/kernel/module.c
index 20fb004e7d8d..7e78dfabca97 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2401,6 +2401,15 @@ static long get_offset(struct module *mod, unsigned int *size,
return ret;
}
+static bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+ if (module_exit_section(sname))
+ return true;
+#endif
+ return module_init_section(sname);
+}
+
/*
* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
* might -- code, read-only data, read-write data, small data. Tally
@@ -2435,7 +2444,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || module_init_section(sname))
+ || module_init_layout_section(sname))
continue;
s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
@@ -2468,7 +2477,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || !module_init_section(sname))
+ || !module_init_layout_section(sname))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
@@ -3121,11 +3130,6 @@ static int rewrite_section_headers(struct load_info *info, int flags)
*/
shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
-#ifndef CONFIG_MODULE_UNLOAD
- /* Don't load .exit sections */
- if (module_exit_section(info->secstrings+shdr->sh_name))
- shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
-#endif
}
/* Track but don't keep modinfo and version sections. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61db50f7ca86..2997ca600d18 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -31,6 +31,7 @@
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/minmax.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -169,6 +170,21 @@ void __ptrace_unlink(struct task_struct *child)
spin_unlock(&child->sighand->siglock);
}
+static bool looks_like_a_spurious_pid(struct task_struct *task)
+{
+ if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
+ return false;
+
+ if (task_pid_vnr(task) == task->ptrace_message)
+ return false;
+ /*
+ * The tracee changed its pid but the PTRACE_EVENT_EXEC event
+ * was not wait()'ed, most probably debugger targets the old
+ * leader which was destroyed in de_thread().
+ */
+ return true;
+}
+
/* Ensure that nothing can wake it up, even SIGKILL */
static bool ptrace_freeze_traced(struct task_struct *task)
{
@@ -179,7 +195,8 @@ static bool ptrace_freeze_traced(struct task_struct *task)
return ret;
spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ !__fatal_signal_pending(task)) {
task->state = __TASK_TRACED;
ret = true;
}
@@ -779,6 +796,24 @@ static int ptrace_peek_siginfo(struct task_struct *child,
return ret;
}
+#ifdef CONFIG_RSEQ
+static long ptrace_get_rseq_configuration(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ struct ptrace_rseq_configuration conf = {
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
+ .rseq_abi_size = sizeof(*task->rseq),
+ .signature = task->rseq_sig,
+ .flags = 0,
+ };
+
+ size = min_t(unsigned long, size, sizeof(conf));
+ if (copy_to_user(data, &conf, size))
+ return -EFAULT;
+ return sizeof(conf);
+}
+#endif
+
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
#else
@@ -1222,6 +1257,12 @@ int ptrace_request(struct task_struct *child, long request,
ret = seccomp_get_metadata(child, addr, datavp);
break;
+#ifdef CONFIG_RSEQ
+ case PTRACE_GET_RSEQ_CONFIGURATION:
+ ret = ptrace_get_rseq_configuration(child, addr, datavp);
+ break;
+#endif
+
default:
break;
}
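From userspace the new request is a plain ptrace() call: addr carries the size of the caller's buffer, data points at it, and the kernel copies back at most sizeof(struct ptrace_rseq_configuration) bytes, returning that size on success. A hedged example, assuming uapi headers new enough to define PTRACE_GET_RSEQ_CONFIGURATION and the structure (header include order can vary between libcs):

#include <stdio.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <linux/ptrace.h>

/* Query the rseq registration of an already-attached, stopped tracee. */
static int dump_rseq_config(pid_t pid)
{
	struct ptrace_rseq_configuration conf;
	long ret;

	ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid,
		     (void *)sizeof(conf), &conf);
	if (ret < 0)
		return -1;
	printf("rseq abi @ 0x%llx, size %u, sig 0x%x\n",
	       (unsigned long long)conf.rseq_abi_pointer,
	       conf.rseq_abi_size, conf.signature);
	return 0;
}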
diff --git a/kernel/resource.c b/kernel/resource.c
index 627e61b0c124..ca9f5198a01f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock);
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p)
{
- /* Caller wants to traverse through siblings only */
- if (sibling_only)
- return p->sibling;
-
if (p->child)
return p->child;
while (!p->sibling && p->parent)
@@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
(*pos)++;
- return (void *)next_resource(p, false);
+ return (void *)next_resource(p);
}
#ifdef CONFIG_PROC_FS
@@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource);
* of the resource that's within [@start..@end]; if none is found, returns
* -ENODEV. Returns -EINVAL for invalid parameters.
*
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
- *
* @start: start address of the resource searched for
* @end: end address of same resource
* @flags: flags which the resource must have
* @desc: descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
* @res: return ptr, if resource found
*
* The caller must specify @start, @end, @flags, and @desc
@@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource);
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, struct resource *res)
+ struct resource *res)
{
- bool siblings_only = true;
struct resource *p;
if (!res)
@@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+ for (p = iomem_resource.child; p; p = next_resource(p)) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p->end < start)
continue;
- /*
- * Now that we found a range that matches what we look for,
- * check the flags and the descriptor. If we were not asked to
- * use only the first level, start looking at children as well.
- */
- siblings_only = first_lvl;
-
if ((p->flags & flags) != flags)
continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
@@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, void *arg,
+ void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
int ret = -EINVAL;
while (start < end &&
- !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+ !find_next_iomem_res(start, end, flags, desc, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
@@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
* @arg: function argument for the callback @func
* @func: callback function that is called for each qualifying resource area
*
- * This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
* desc are valid candidates.
*
@@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
- return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+ return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
@@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
{
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
}
/*
@@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg,
{
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
}
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
- !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
- false, &res)) {
+ !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
pfn = PFN_UP(res.start);
end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
@@ -523,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+static int __region_intersects(resource_size_t start, size_t size,
+ unsigned long flags, unsigned long desc)
+{
+ struct resource res;
+ int type = 0; int other = 0;
+ struct resource *p;
+
+ res.start = start;
+ res.end = start + size - 1;
+
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ bool is_type = (((p->flags & flags) == flags) &&
+ ((desc == IORES_DESC_NONE) ||
+ (desc == p->desc)));
+
+ if (resource_overlaps(p, &res))
+ is_type ? type++ : other++;
+ }
+
+ if (type == 0)
+ return REGION_DISJOINT;
+
+ if (other == 0)
+ return REGION_INTERSECTS;
+
+ return REGION_MIXED;
+}
+
/**
* region_intersects() - determine intersection of region with known resources
* @start: region start address
@@ -546,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram);
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- struct resource res;
- int type = 0; int other = 0;
- struct resource *p;
-
- res.start = start;
- res.end = start + size - 1;
+ int ret;
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- bool is_type = (((p->flags & flags) == flags) &&
- ((desc == IORES_DESC_NONE) ||
- (desc == p->desc)));
-
- if (resource_overlaps(p, &res))
- is_type ? type++ : other++;
- }
+ ret = __region_intersects(start, size, flags, desc);
read_unlock(&resource_lock);
- if (type == 0)
- return REGION_DISJOINT;
-
- if (other == 0)
- return REGION_INTERSECTS;
-
- return REGION_MIXED;
+ return ret;
}
EXPORT_SYMBOL_GPL(region_intersects);
@@ -1171,31 +1160,16 @@ struct address_space *iomem_get_mapping(void)
return smp_load_acquire(&iomem_inode)->i_mapping;
}
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static int __request_region_locked(struct resource *res, struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = alloc_resource(GFP_KERNEL);
- struct resource *orig_parent = parent;
-
- if (!res)
- return NULL;
res->name = name;
res->start = start;
res->end = start + n - 1;
- write_lock(&resource_lock);
-
for (;;) {
struct resource *conflict;
@@ -1231,13 +1205,40 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- free_resource(res);
- res = NULL;
- break;
+ return -EBUSY;
}
+
+ return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name, int flags)
+{
+ struct resource *res = alloc_resource(GFP_KERNEL);
+ int ret;
+
+ if (!res)
+ return NULL;
+
+ write_lock(&resource_lock);
+ ret = __request_region_locked(res, parent, start, n, name, flags);
write_unlock(&resource_lock);
- if (res && orig_parent == &iomem_resource)
+ if (ret) {
+ free_resource(res);
+ return NULL;
+ }
+
+ if (parent == &iomem_resource)
revoke_iomem(res);
return res;
@@ -1779,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev,
{
resource_size_t end, addr;
struct resource *res;
+ struct region_devres *dr = NULL;
size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
addr = end - size + 1UL;
+ res = alloc_resource(GFP_KERNEL);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev) {
+ dr = devres_alloc(devm_region_release,
+ sizeof(struct region_devres), GFP_KERNEL);
+ if (!dr) {
+ free_resource(res);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ write_lock(&resource_lock);
for (; addr > size && addr >= base->start; addr -= size) {
- if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+ if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
REGION_DISJOINT)
continue;
- if (dev)
- res = devm_request_mem_region(dev, addr, size, name);
- else
- res = request_mem_region(addr, size, name);
- if (!res)
- return ERR_PTR(-ENOMEM);
+ if (__request_region_locked(res, &iomem_resource, addr, size,
+ name, 0))
+ break;
+
+ if (dev) {
+ dr->parent = &iomem_resource;
+ dr->start = addr;
+ dr->n = size;
+ devres_add(dev, dr);
+ }
+
res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ write_unlock(&resource_lock);
+
+ /*
+ * A driver is claiming this region so revoke any mappings.
+ */
+ revoke_iomem(res);
return res;
}
+ write_unlock(&resource_lock);
+
+ free_resource(res);
+ if (dr)
+ devres_free(dr);
return ERR_PTR(-ERANGE);
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index a4f86a9d6937..35f7bd0fced0 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -84,13 +84,20 @@
static int rseq_update_cpu_id(struct task_struct *t)
{
u32 cpu_id = raw_smp_processor_id();
+ struct rseq __user *rseq = t->rseq;
- if (put_user(cpu_id, &t->rseq->cpu_id_start))
- return -EFAULT;
- if (put_user(cpu_id, &t->rseq->cpu_id))
- return -EFAULT;
+ if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ goto efault;
+ unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
+ unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ user_write_access_end();
trace_rseq_update(t);
return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
}
static int rseq_reset_rseq_cpu_id(struct task_struct *t)
@@ -120,8 +127,13 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
+#ifdef CONFIG_64BIT
+ if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
+ return -EFAULT;
+#else
if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
return -EFAULT;
+#endif
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
@@ -204,9 +216,13 @@ static int clear_rseq_cs(struct task_struct *t)
*
* Set rseq_cs to NULL.
*/
+#ifdef CONFIG_64BIT
+ return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+#else
if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
return -EFAULT;
return 0;
+#endif
}
/*
@@ -266,8 +282,6 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
- goto error;
ret = rseq_ip_fixup(regs);
if (unlikely(ret < 0))
goto error;
@@ -294,8 +308,7 @@ void rseq_syscall(struct pt_regs *regs)
if (!t->rseq)
return;
- if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
- rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
+ if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
force_sig(SIGSEGV);
}
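Replacing two put_user() calls with one user_write_access_begin()/unsafe_put_user() section is a general pattern for batching user-space stores under a single access window. A minimal sketch of the same idiom, using a made-up two-field structure (struct pair and pair_update_user() are not part of the patch):

struct pair {
	u32 a;
	u32 b;
};

/* Write both fields inside one user-access window, as rseq_update_cpu_id()
 * does above; both stores share the same fault label. */
static int pair_update_user(struct pair __user *up, u32 a, u32 b)
{
	if (!user_write_access_begin(up, sizeof(*up)))
		return -EFAULT;
	unsafe_put_user(a, &up->a, efault_end);
	unsafe_put_user(b, &up->b, efault_end);
	user_write_access_end();
	return 0;

efault_end:
	user_write_access_end();
	return -EFAULT;
}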
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 12bca64dff73..c2b2859ddd82 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -41,7 +41,7 @@
* Otherwise it tries to create a semi stable clock from a mixture of other
* clocks, including:
*
- * - GTOD (clock monotomic)
+ * - GTOD (clock monotonic)
* - sched_clock()
* - explicit idle events
*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 347127e73422..5226cc26a095 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
+#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
@@ -737,7 +747,7 @@ static void nohz_csd_func(void *info)
/*
* Release the rq::nohz_csd.
*/
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
@@ -928,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
- return clamp_value / UCLAMP_BUCKET_DELTA;
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -1811,7 +1821,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
return cpu_online(cpu);
/* Regular kernel threads don't get to stay during offline. */
- if (cpu_rq(cpu)->balance_push)
+ if (cpu_dying(cpu))
return false;
/* But are allowed during online. */
@@ -1927,6 +1937,12 @@ static int migration_cpu_stop(void *data)
rq_lock(rq, &rf);
/*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
+
+ /*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
@@ -1936,8 +1952,7 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
}
@@ -1976,8 +1991,7 @@ static int migration_cpu_stop(void *data)
* somewhere allowed, we're done.
*/
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
goto out;
}
@@ -2165,16 +2179,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
*
* (1) In the cases covered above. There is one more where the completion is
* signaled within affine_move_task() itself: when a subsequent affinity request
- * cancels the need for an active migration. Consider:
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
- * P0@CPU0 P1 P2
- *
- * migrate_disable();
- * <preempted>
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
* set_cpus_allowed_ptr(P0, [0, 1]);
* <signal completion>
* <awakes>
@@ -4244,8 +4263,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq;
-
/*
* New tasks start with FORK_PREEMPT_COUNT, see there and
* finish_task_switch() for details.
@@ -4255,7 +4272,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* PREEMPT_COUNT kernels).
*/
- rq = finish_task_switch(prev);
+ finish_task_switch(prev);
preempt_enable();
if (current->set_child_tid)
@@ -4520,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_resched_latency(struct rq *rq)
+{
+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+ u64 resched_latency, now = rq_clock(rq);
+ static bool warned_once;
+
+ if (sysctl_resched_latency_warn_once && warned_once)
+ return 0;
+
+ if (!need_resched() || !latency_warn_ms)
+ return 0;
+
+ if (system_state == SYSTEM_BOOTING)
+ return 0;
+
+ if (!rq->last_seen_need_resched_ns) {
+ rq->last_seen_need_resched_ns = now;
+ rq->ticks_without_resched = 0;
+ return 0;
+ }
+
+ rq->ticks_without_resched++;
+ resched_latency = now - rq->last_seen_need_resched_ns;
+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ return 0;
+
+ warned_once = true;
+
+ return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+ long val;
+
+ if ((kstrtol(str, 0, &val))) {
+ pr_warn("Unable to set resched_latency_warn_ms\n");
+ return 1;
+ }
+
+ sysctl_resched_latency_warn_ms = val;
+ return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4531,6 +4597,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+ u64 resched_latency;
arch_scale_freq_tick();
sched_clock_tick();
@@ -4541,11 +4608,15 @@ void scheduler_tick(void)
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
+ if (sched_feat(LATENCY_WARN))
+ resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
- psi_task_tick(rq);
rq_unlock(rq, &rf);
+ if (sched_feat(LATENCY_WARN) && resched_latency)
+ resched_latency_warn(cpu, resched_latency);
+
perf_event_task_tick();
#ifdef CONFIG_SMP
@@ -5040,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+ rq->last_seen_need_resched_ns = 0;
+#endif
if (likely(prev != next)) {
rq->nr_switches++;
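
Taken together, the hunks above implement the LATENCY_WARN machinery: cpu_resched_latency() stamps rq->last_seen_need_resched_ns on the first tick that observes need_resched set, every later tick compares the accumulated delay against resched_latency_warn_ms, and __schedule() clears the stamp once a reschedule finally happens. A rough userspace sketch of that detection loop follows; it is illustrative only, and the helper names and the simulated 4ms tick are not taken from the kernel sources.

/* Sketch of the need_resched latency check driven from the tick. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static uint64_t last_seen_need_resched_ns;	/* like rq->last_seen_need_resched_ns */
static unsigned int ticks_without_resched;	/* like rq->ticks_without_resched */

/* Returns the latency to warn about, or 0. */
static uint64_t check_resched_latency(bool need_resched, uint64_t now_ns,
				      unsigned int warn_ms)
{
	uint64_t latency;

	if (!need_resched || !warn_ms) {
		last_seen_need_resched_ns = 0;	/* cleared by __schedule() in the kernel */
		return 0;
	}

	if (!last_seen_need_resched_ns) {
		last_seen_need_resched_ns = now_ns;
		ticks_without_resched = 0;
		return 0;
	}

	ticks_without_resched++;
	latency = now_ns - last_seen_need_resched_ns;
	if (latency <= warn_ms * NSEC_PER_MSEC)
		return 0;

	return latency;
}

int main(void)
{
	uint64_t now = 0, lat;
	int tick;

	/* Simulate a 4ms tick (HZ=250) with need_resched stuck for 30 ticks. */
	for (tick = 0; tick < 30; tick++, now += 4 * NSEC_PER_MSEC) {
		lat = check_resched_latency(true, now, 100);
		if (lat)
			printf("tick %d: need_resched pending for %llu ns (%u ticks)\n",
			       tick, (unsigned long long)lat, ticks_without_resched);
	}
	return 0;
}
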
@@ -5365,23 +5439,23 @@ enum {
preempt_dynamic_full,
};
-static int preempt_dynamic_mode = preempt_dynamic_full;
+int preempt_dynamic_mode = preempt_dynamic_full;
-static int sched_dynamic_mode(const char *str)
+int sched_dynamic_mode(const char *str)
{
if (!strcmp(str, "none"))
- return 0;
+ return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
- return 1;
+ return preempt_dynamic_voluntary;
if (!strcmp(str, "full"))
- return 2;
+ return preempt_dynamic_full;
- return -1;
+ return -EINVAL;
}
-static void sched_dynamic_update(int mode)
+void sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
@@ -5438,77 +5512,8 @@ static int __init setup_preempt_mode(char *str)
}
__setup("preempt=", setup_preempt_mode);
-#ifdef CONFIG_SCHED_DEBUG
-
-static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[16];
- int mode;
-
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- mode = sched_dynamic_mode(strstrip(buf));
- if (mode < 0)
- return mode;
-
- sched_dynamic_update(mode);
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static int sched_dynamic_show(struct seq_file *m, void *v)
-{
- static const char * preempt_modes[] = {
- "none", "voluntary", "full"
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
- if (preempt_dynamic_mode == i)
- seq_puts(m, "(");
- seq_puts(m, preempt_modes[i]);
- if (preempt_dynamic_mode == i)
- seq_puts(m, ")");
-
- seq_puts(m, " ");
- }
-
- seq_puts(m, "\n");
- return 0;
-}
-
-static int sched_dynamic_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, sched_dynamic_show, NULL);
-}
-
-static const struct file_operations sched_dynamic_fops = {
- .open = sched_dynamic_open,
- .write = sched_dynamic_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static __init int sched_init_debug_dynamic(void)
-{
- debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
- return 0;
-}
-late_initcall(sched_init_debug_dynamic);
-
-#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_PREEMPT_DYNAMIC */
-
/*
* This is the entry point to schedule() from kernel preemption
* off of irq context.
@@ -7633,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
/*
* Ensure we only run per-cpu kthreads once the CPU goes !active.
+ *
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
*/
static void balance_push(struct rq *rq)
{
@@ -7640,12 +7648,19 @@ static void balance_push(struct rq *rq)
lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
*/
rq->balance_callback = &balance_push_callback;
/*
+ * Only active while going offline.
+ */
+ if (!cpu_dying(rq->cpu))
+ return;
+
+ /*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*
@@ -7653,7 +7668,7 @@ static void balance_push(struct rq *rq)
* histerical raisins.
*/
if (rq->idle == push_task ||
- ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+ kthread_is_per_cpu(push_task) ||
is_migration_disabled(push_task)) {
/*
@@ -7698,7 +7713,6 @@ static void balance_push_set(int cpu, bool on)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
- rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
@@ -7823,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq_flags rf;
/*
- * Make sure that when the hotplug state machine does a roll-back
- * we clear balance_push. Ideally that would happen earlier...
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
*/
balance_push_set(cpu, false);
@@ -8009,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);
- /*
- * Now that the CPU is offline, make sure we're welcome
- * to new tasks once we come back up.
- */
- balance_push_set(cpu, false);
-
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
@@ -8199,7 +8207,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8246,6 +8254,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
@@ -8970,7 +8979,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
- * Likewise, bound things on the otherside by preventing insane quota
+ * Likewise, bound things on the other side by preventing insane quota
* periods. This also allows us to normalize in computing quota
* feasibility.
*/
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..104a1bade14f 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -104,7 +104,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
- * the sum of suages.
+ * the sum of usages.
*/
BUG_ON(index > CPUACCT_STAT_NSTATS);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6ee9c9bbe505..4f09afd2f321 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -466,7 +466,7 @@ static void sugov_work(struct kthread_work *work)
/*
* Hold sg_policy->update_lock shortly to handle the case where:
- * incase sg_policy->next_freq is read here, and then updated by
+ * in case sg_policy->next_freq is read here, and then updated by
* sugov_deferred_update() just before work_in_progress is set to false
* here, we may miss queueing the new update.
*
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index ec9be789c7e2..d583f2aa744e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -77,7 +77,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
* When looking at the vector, we need to read the counter,
* do a memory barrier, then read the mask.
*
- * Note: This is still all racey, but we can deal with it.
+ * Note: This is still all racy, but we can deal with it.
* Ideally, we only want to look at masks that are set.
*
* If a mask is not set, then the only thing wrong is that we
@@ -186,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* The cost of this trade-off is not entirely clear and will probably
* be good for some workloads and bad for others.
*
- * The main idea here is that if some CPUs were overcommitted, we try
+ * The main idea here is that if some CPUs were over-committed, we try
* to spread which is what the scheduler traditionally did. Sys admins
* must do proper RT planning to avoid overloading the system if they
* really care.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2c36a5fad589..872e481d5098 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -563,7 +563,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
/*
* If either stime or utime are 0, assume all runtime is userspace.
- * Once a task gets some ticks, the monotonicy code at 'update:'
+ * Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aac3539aa0fe..9a2989749b8d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -245,7 +245,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
@@ -267,7 +267,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* fires.
*
* If the task wakes up again before the inactive timer fires,
- * the timer is cancelled, whereas if the task wakes up after the
+ * the timer is canceled, whereas if the task wakes up after the
* inactive timer fired (and running_bw has been decreased) the
* task's utilization has to be added to running_bw again.
* A flag in the deadline scheduling entity (dl_non_contending)
@@ -385,7 +385,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
dl_se->dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
@@ -1206,7 +1206,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
* multiplied by 2^BW_SHIFT, the result has to be shifted right by
* BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
* dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
@@ -1737,7 +1737,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
@@ -2745,7 +2745,7 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
/*
* Default limits for DL period; on the top end we guard against small util
- * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
* guard against timer DoS.
*/
unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 486f403a778b..9c882f20803e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
*/
#include "sched.h"
-static DEFINE_SPINLOCK(sched_debug_lock);
-
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -169,245 +167,258 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-__read_mostly bool sched_debug_enabled;
+#ifdef CONFIG_SMP
-static __init int sched_init_debug(void)
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
+ char buf[16];
- debugfs_create_bool("sched_debug", 0644, NULL,
- &sched_debug_enabled);
+ if (cnt > 15)
+ cnt = 15;
- return 0;
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ return -EINVAL;
+
+ if (sched_update_scaling())
+ return -EINVAL;
+
+ *ppos += cnt;
+ return cnt;
}
-late_initcall(sched_init_debug);
-#ifdef CONFIG_SMP
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+ return 0;
+}
-#ifdef CONFIG_SYSCTL
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_scaling_show, NULL);
+}
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
+static const struct file_operations sched_scaling_fops = {
+ .open = sched_scaling_open,
+ .write = sched_scaling_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
-};
+#endif /* SMP */
-static struct ctl_table *sd_alloc_ctl_entry(int n)
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
- return entry;
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
}
-static void sd_free_ctl_entry(struct ctl_table **tablep)
+static int sched_dynamic_show(struct seq_file *m, void *v)
{
- struct ctl_table *entry;
-
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
+ static const char * preempt_modes[] = {
+ "none", "voluntary", "full"
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
}
- kfree(*tablep);
- *tablep = NULL;
+ seq_puts(m, "\n");
+ return 0;
}
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
+ return single_open(filp, sched_dynamic_show, NULL);
}
-static int sd_ctl_doflags(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+__read_mostly bool sched_debug_verbose;
+
+static const struct seq_operations sched_debug_sops;
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
{
- unsigned long flags = *(unsigned long *)table->data;
- size_t data_size = 0;
- size_t len = 0;
- char *tmp, *buf;
- int idx;
+ return seq_open(filp, &sched_debug_sops);
+}
- if (write)
- return 0;
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
- for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
- char *name = sd_flag_debug[idx].name;
+static struct dentry *debugfs_sched;
- /* Name plus whitespace */
- data_size += strlen(name) + 1;
- }
+static __init int sched_init_debug(void)
+{
+ struct dentry __maybe_unused *numa;
- if (*ppos > data_size) {
- *lenp = 0;
- return 0;
- }
+ debugfs_sched = debugfs_create_dir("sched", NULL);
- buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+ debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+ debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
+#endif
- for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
- char *name = sd_flag_debug[idx].name;
+ debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
+ debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+ debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
- len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
- }
+ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
- tmp = buf + *ppos;
- len -= *ppos;
+#ifdef CONFIG_SMP
+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- if (len > *lenp)
- len = *lenp;
- if (len)
- memcpy(buffer, tmp, len);
- if (len < *lenp) {
- ((char *)buffer)[len] = '\n';
- len++;
- }
+ mutex_lock(&sched_domains_mutex);
+ update_sched_domain_debugfs();
+ mutex_unlock(&sched_domains_mutex);
+#endif
- *lenp = len;
- *ppos += len;
+#ifdef CONFIG_NUMA_BALANCING
+ numa = debugfs_create_dir("numa_balancing", debugfs_sched);
- kfree(buf);
+ debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+ debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+ debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+ debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+#endif
+
+ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
return 0;
}
+late_initcall(sched_init_debug);
+
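
sched_init_debug() now gathers the debug knobs under a single debugfs directory, so the tunables land in /sys/kernel/debug/sched/ (features, verbose, preempt, latency_ns, latency_warn_ms, domains/, numa_balancing/, ...). A small sketch of reading one of them from userspace, assuming debugfs is mounted at the conventional /sys/kernel/debug location:

/* Read a scheduler debugfs knob; the path assumes a standard debugfs mount. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sched/latency_warn_ms";
	unsigned int ms;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%u", &ms) == 1)
		printf("resched latency warning threshold: %u ms\n", ms);
	fclose(f);
	return 0;
}
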
+#ifdef CONFIG_SMP
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
+static cpumask_var_t sd_sysctl_cpus;
+static struct dentry *sd_dentry;
+
+static int sd_flags_show(struct seq_file *m, void *v)
{
- struct ctl_table *table = sd_alloc_ctl_entry(9);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
- set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[8] is terminator */
-
- return table;
+ unsigned long flags = *(unsigned int *)m->private;
+ int idx;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ seq_puts(m, sd_flag_debug[idx].name);
+ seq_puts(m, " ");
+ }
+ seq_puts(m, "\n");
+
+ return 0;
}
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+static int sd_flags_open(struct inode *inode, struct file *file)
{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
+ return single_open(file, sd_flags_show, inode->i_private);
}
-static cpumask_var_t sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
+static const struct file_operations sd_flags_fops = {
+ .open = sd_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
-void register_sched_domain_sysctl(void)
+static void register_sd(struct sched_domain *sd, struct dentry *parent)
{
- static struct ctl_table *cpu_entries;
- static struct ctl_table **cpu_idx;
- static bool init_done = false;
- char buf[32];
- int i;
+#define SDM(type, mode, member) \
+ debugfs_create_##type(#member, mode, parent, &sd->member)
- if (!cpu_entries) {
- cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
- if (!cpu_entries)
- return;
+ SDM(ulong, 0644, min_interval);
+ SDM(ulong, 0644, max_interval);
+ SDM(u64, 0644, max_newidle_lb_cost);
+ SDM(u32, 0644, busy_factor);
+ SDM(u32, 0644, imbalance_pct);
+ SDM(u32, 0644, cache_nice_tries);
+ SDM(str, 0444, name);
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = cpu_entries;
- }
+#undef SDM
- if (!cpu_idx) {
- struct ctl_table *e = cpu_entries;
-
- cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
- if (!cpu_idx)
- return;
+ debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+}
- /* deal with sparse possible map */
- for_each_possible_cpu(i) {
- cpu_idx[i] = e;
- e++;
- }
- }
+void update_sched_domain_debugfs(void)
+{
+ int cpu, i;
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
- }
-
- if (!init_done) {
- init_done = true;
- /* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
- for_each_cpu(i, sd_sysctl_cpus) {
- struct ctl_table *e = cpu_idx[i];
+ if (!sd_dentry)
+ sd_dentry = debugfs_create_dir("domains", debugfs_sched);
- if (e->child)
- sd_free_ctl_entry(&e->child);
+ for_each_cpu(cpu, sd_sysctl_cpus) {
+ struct sched_domain *sd;
+ struct dentry *d_cpu;
+ char buf[32];
- if (!e->procname) {
- snprintf(buf, 32, "cpu%d", i);
- e->procname = kstrdup(buf, GFP_KERNEL);
+ snprintf(buf, sizeof(buf), "cpu%d", cpu);
+ debugfs_remove(debugfs_lookup(buf, sd_dentry));
+ d_cpu = debugfs_create_dir(buf, sd_dentry);
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ struct dentry *d_sd;
+
+ snprintf(buf, sizeof(buf), "domain%d", i);
+ d_sd = debugfs_create_dir(buf, d_cpu);
+
+ register_sd(sd, d_sd);
+ i++;
}
- e->mode = 0555;
- e->child = sd_alloc_ctl_cpu_table(i);
- __cpumask_clear_cpu(i, sd_sysctl_cpus);
+ __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
}
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
void dirty_sched_domain_sysctl(int cpu)
@@ -416,13 +427,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-/* may be called multiple times per register */
-void unregister_sched_domain_sysctl(void)
-{
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
-}
-#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -470,16 +474,37 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];
-static char *task_group_path(struct task_group *tg)
+static void task_group_path(struct task_group *tg, char *path, int plen)
{
- if (autogroup_path(tg, group_path, PATH_MAX))
- return group_path;
+ if (autogroup_path(tg, path, plen))
+ return;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, path, plen);
+}
- return group_path;
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
}
#endif
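
The SEQ_printf_task_group_path() macro above replaces the old global sched_debug_lock held around whole print sections with a trylock: the winner gets the PATH_MAX-sized group_path[] buffer, while a concurrent caller falls back to a short stack buffer whose pre-placed "..." only becomes visible when the path actually filled the buffer. A standalone sketch of the same trylock-with-fallback pattern, with a pthread mutex standing in for the spinlock and a made-up build_path() in place of cgroup_path():

/* Trylock-with-fallback: big shared buffer if uncontended,
 * small stack buffer (with a "..." truncation hint) otherwise. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t path_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_path[4096];		/* like group_path[PATH_MAX] */

static void build_path(char *buf, size_t len, const char *name)
{
	snprintf(buf, len, "/sys/fs/cgroup/%s", name);	/* stand-in for cgroup_path() */
}

static void print_group_path(const char *name)
{
	if (pthread_mutex_trylock(&path_lock) == 0) {
		build_path(shared_path, sizeof(shared_path), name);
		printf("%s\n", shared_path);
		pthread_mutex_unlock(&path_lock);
	} else {
		char buf[128];
		char *bufend = buf + sizeof(buf) - 3;

		build_path(buf, bufend - buf, name);
		/* Visible only when build_path() filled the buffer, i.e. truncated. */
		strcpy(bufend - 1, "...");
		printf("%s\n", buf);
	}
}

int main(void)
{
	print_group_path("system.slice/example.service");
	return 0;
}
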
@@ -506,7 +531,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -543,7 +568,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
@@ -614,7 +639,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
@@ -666,7 +691,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
#ifdef CONFIG_X86
{
@@ -717,13 +741,11 @@ do { \
}
#undef P
- spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
@@ -815,7 +837,7 @@ void sysrq_sched_debug_show(void)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is CPU 0.
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
@@ -860,15 +882,6 @@ static const struct seq_operations sched_debug_sops = {
.show = sched_debug_show,
};
-static int __init init_sched_debug_procfs(void)
-{
- if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
- return -ENOMEM;
- return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
@@ -1033,3 +1046,13 @@ void proc_sched_set_task(struct task_struct *p)
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
}
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+ static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+ WARN(__ratelimit(&latency_check_ratelimit),
+ "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+ "without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..3248e24a90b0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
@@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+/*
+ * The margin used when comparing CPU capacities.
+ * Is 'cap1' noticeably greater than 'cap2'?
+ *
+ * (default: ~5%)
+ */
+#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif
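
capacity_greater() is the replacement for the group_smaller_{min,max}_cpu_capacity() helpers removed further down in this patch: cap1 only counts as noticeably greater when it exceeds cap2 by roughly 5% (1078/1024 is about 1.053). A quick standalone check of where that margin falls, assuming the usual capacity scale of 1024:

#include <assert.h>
#include <stdio.h>

#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)

int main(void)
{
	/* 1024 vs 972: 1024*1024 = 1048576 > 972*1078 = 1047816 -> noticeably greater */
	assert(capacity_greater(1024, 972));
	/* 1024 vs 973: 1048576 > 973*1078 = 1048894 is false -> inside the margin */
	assert(!capacity_greater(1024, 973));
	/* Equal capacities are never "noticeably greater". */
	assert(!capacity_greater(1024, 1024));

	printf("capacity_greater margin is about %.1f%%\n",
	       (1078.0 / 1024.0 - 1.0) * 100.0);
	return 0;
}
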
#ifdef CONFIG_CFS_BANDWIDTH
@@ -229,22 +236,25 @@ static void __update_inv_weight(struct load_weight *lw)
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
+ int fs;
__update_inv_weight(lw);
- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
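
The rewritten __calc_delta() drops the bit-at-a-time normalization loops: fls() on the upper 32 bits of fact yields the whole excess bit count at once, which is subtracted from shift and shifted out of fact in one step. A small sketch of the equivalence between the two forms, using __builtin_clz() to build an fls() stand-in (the test value is arbitrary):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* fls(x): index of the most significant set bit, 1-based (0 for x == 0). */
static int fls(uint32_t x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* Old style: shift fact right one bit at a time until it fits in 32 bits. */
static void normalize_loop(uint64_t *fact, int *shift)
{
	while (*fact >> 32) {
		*fact >>= 1;
		(*shift)--;
	}
}

/* New style: drop all excess high bits in one step, as __calc_delta() now does. */
static void normalize_fls(uint64_t *fact, int *shift)
{
	uint32_t fact_hi = (uint32_t)(*fact >> 32);

	if (fact_hi) {
		int fs = fls(fact_hi);

		*shift -= fs;
		*fact >>= fs;
	}
}

int main(void)
{
	uint64_t a = 0x123456789abcdefULL, b = a;
	int sa = 32, sb = 32;

	normalize_loop(&a, &sa);
	normalize_fls(&b, &sb);

	assert(a == b && sa == sb);
	printf("fact=%#llx shift=%d\n", (unsigned long long)a, sa);
	return 0;
}
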
@@ -624,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods:
*/
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
-
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
@@ -682,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+ unsigned int nr_running = cfs_rq->nr_running;
+ u64 slice;
+
+ if (sched_feat(ALT_PERIOD))
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+
+ slice = __sched_period(nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
@@ -699,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
slice = __calc_delta(slice, se->load.weight, load);
}
+
+ if (sched_feat(BASE_SLICE))
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
+
return slice;
}
@@ -1122,7 +1137,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
return rss / nr_scan_pages;
}
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
@@ -2574,7 +2589,7 @@ no_join:
}
/*
- * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
@@ -3941,13 +3956,15 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
*
* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
*
- * NOTE: this only works when value + maring < INT_MAX.
+ * NOTE: this only works when value + margin < INT_MAX.
*/
static inline bool within_margin(int value, int margin)
{
@@ -3958,7 +3975,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
- long last_ewma_diff;
+ long last_ewma_diff, last_enqueued_diff;
struct util_est ue;
if (!sched_feat(UTIL_EST))
@@ -3979,6 +3996,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
+ last_enqueued_diff = ue.enqueued;
+
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
@@ -3992,12 +4011,17 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
}
/*
- * Skip update of task's estimated utilization when its EWMA is
+ * Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ last_enqueued_diff -= ue.enqueued;
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
+ goto done;
+
return;
+ }
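
The check above leans on within_margin(), whose comment earlier in this hunk states the identity abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) for value + margin < INT_MAX; the new code applies it to both the EWMA delta and the enqueued delta, so the EWMA is only left untouched when both are within the ~1% UTIL_EST_MARGIN. A brute-force self-check of that identity (standalone and illustrative only):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define SCHED_CAPACITY_SCALE	1024
#define UTIL_EST_MARGIN		(SCHED_CAPACITY_SCALE / 100)	/* 10 */

/* Branchless check that abs(value) < margin, as in within_margin(). */
static bool within_margin(int value, int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	int x;

	/* The trick matches abs(x) < margin for every small signed value. */
	for (x = -1000; x <= 1000; x++)
		assert(within_margin(x, UTIL_EST_MARGIN) == (abs(x) < UTIL_EST_MARGIN));

	printf("within_margin(%d, %d) = %d\n", -9, UTIL_EST_MARGIN,
	       within_margin(-9, UTIL_EST_MARGIN));
	printf("within_margin(%d, %d) = %d\n", 10, UTIL_EST_MARGIN,
	       within_margin(10, UTIL_EST_MARGIN));
	return 0;
}
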
/*
* To avoid overestimation of actual task utilization, skip updates if
@@ -4244,7 +4268,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When bandwidth control is enabled, cfs might have been removed
* because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
+ * add it unconditionally.
*/
if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
@@ -5299,7 +5323,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* bits doesn't do much.
*/
-/* cpu online calback */
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
@@ -6098,6 +6122,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
return -1;
}
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
#else /* CONFIG_SCHED_SMT */
static inline void set_idle_cores(int cpu, int val)
@@ -6114,6 +6156,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core);
}
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
#endif /* CONFIG_SCHED_SMT */
/*
@@ -6121,11 +6168,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- bool smt = test_idle_cores(target, false);
int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time;
@@ -6136,7 +6182,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP) && !smt) {
+ if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
/*
@@ -6156,7 +6202,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
for_each_cpu_wrap(cpu, cpus, target) {
- if (smt) {
+ if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
return i;
@@ -6170,10 +6216,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
}
- if (smt)
- set_idle_cores(this, false);
+ if (has_idle_core)
+ set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && !smt) {
+ if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
update_avg(&this_sd->avg_scan_cost, time);
}
@@ -6228,6 +6274,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util;
int i, recent_used_cpu;
@@ -6307,7 +6354,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;
- i = select_idle_cpu(p, sd, target);
+ if (sched_smt_active()) {
+ has_idle_core = test_idle_cores(target, false);
+
+ if (!has_idle_core && cpus_share_cache(prev, target)) {
+ i = select_idle_smt(p, sd, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
+ }
+
+ i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6471,7 +6528,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
- sub_positive(&util, task_util(p));
+ lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
@@ -6518,8 +6575,24 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* its pd list and will not be accounted by compute_energy().
*/
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
+ unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
+ unsigned long cpu_util, util_running = util_freq;
+ struct task_struct *tsk = NULL;
+
+ /*
+ * When @p is placed on @cpu:
+ *
+ * util_running = max(cpu_util, cpu_util_est) +
+ * max(task_util, _task_util_est)
+ *
+ * while cpu_util_next is: max(cpu_util + task_util,
+ * cpu_util_est + _task_util_est)
+ */
+ if (cpu == dst_cpu) {
+ tsk = p;
+ util_running =
+ cpu_util_next(cpu, p, -1) + task_util_est(p);
+ }
/*
* Busy time computation: utilization clamping is not
@@ -6527,7 +6600,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
+ sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
ENERGY_UTIL, NULL);
/*
@@ -6537,7 +6610,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
+ cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
max_util = max(max_util, cpu_util);
}
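
The comment in this hunk distinguishes util_running, the busy-time input for the energy sum, from util_freq, the cpu_util_next() value used for frequency selection; the two differ whenever util_est dominates on one side but not the other. A tiny sketch with made-up numbers (not from any real trace) showing the two estimates diverging:

/* Illustrative numbers only; not taken from the kernel sources. */
#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long cpu_util = 300, cpu_util_est = 100;	/* rq-side PELT vs util_est */
	unsigned long task_util = 50, task_util_est = 200;	/* task-side PELT vs util_est */

	/* What cpu_util_next() reports when the task is placed on this CPU: */
	unsigned long util_freq = max_ul(cpu_util + task_util,
					 cpu_util_est + task_util_est);

	/* What the patch now feeds into the ENERGY_UTIL (busy time) estimate: */
	unsigned long util_running = max_ul(cpu_util, cpu_util_est) +
				     max_ul(task_util, task_util_est);

	printf("util_freq    = %lu\n", util_freq);	/* 350 */
	printf("util_running = %lu\n", util_running);	/* 500 */
	return 0;
}
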
@@ -6935,7 +7008,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
+ * unconditionally check_preempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
@@ -7392,8 +7465,7 @@ enum migration_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
+#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
@@ -7539,6 +7611,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /* Disregard pcpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -7551,10 +7627,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
- * already computed one in current iteration.
+ * Avoid computing new_dst_cpu
+ * - for NEWLY_IDLE
+ * - if we have already computed one in current iteration
+ * - if it's an active balance
*/
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE ||
+ env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
@@ -7569,7 +7648,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
- /* Record that we found atleast one task that could run on dst_cpu */
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
@@ -7579,10 +7658,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) active balance
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
+ if (env->flags & LBF_ACTIVE_LB)
+ return 1;
+
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
@@ -7659,6 +7742,15 @@ static int detach_tasks(struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock);
+ /*
+ * Source run queue has been emptied by another CPU, clear
+ * LBF_ALL_PINNED flag as we will not test any task.
+ */
+ if (env->src_rq->nr_running <= 1) {
+ env->flags &= ~LBF_ALL_PINNED;
+ return 0;
+ }
+
if (env->imbalance <= 0)
return 0;
@@ -7708,8 +7800,7 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
-
- if ((load >> env->sd->nr_balance_failed) > env->imbalance)
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
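
shr_bound(), defined in sched.h as part of this series rather than in this hunk, clamps the shift count to the width of the value, so a long run of failed balance attempts can no longer turn load >> nr_balance_failed into an undefined shift by 64 or more. A minimal sketch of such a bounded shift for a 64-bit load (the helper name here is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bounded right shift: cap the shift at width-1 so it is never UB.
 * Rough equivalent of the kernel's shr_bound() for a u64 value. */
static uint64_t shr_bound64(uint64_t val, unsigned int shift)
{
	return val >> (shift < 63 ? shift : 63);
}

int main(void)
{
	uint64_t load = 2048;

	/* Small failure counts behave exactly like a plain shift... */
	assert(shr_bound64(load, 3) == (load >> 3));

	/* ...but a pathological nr_balance_failed no longer shifts by >= 64. */
	printf("load after 100 failed attempts: %llu\n",
	       (unsigned long long)shr_bound64(load, 100));	/* 0, not UB */
	return 0;
}
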
@@ -7854,16 +7945,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
{
- rq->last_blocked_load_update_tick = jiffies;
+ WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
@@ -8024,6 +8119,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
+ update_blocked_load_tick(rq);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
@@ -8311,26 +8407,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
return false;
}
-/*
- * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity than sched_group ref.
- */
-static inline bool
-group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
-}
-
-/*
- * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity_orig than sched_group ref.
- */
-static inline bool
-group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
-}
-
static inline enum
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
@@ -8354,28 +8430,6 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
}
-static bool update_nohz_stats(struct rq *rq, bool force)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned int cpu = rq->cpu;
-
- if (!rq->has_blocked_load)
- return false;
-
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return false;
-
- if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
- return true;
-
- update_blocked_averages(cpu);
-
- return rq->has_blocked_load;
-#else
- return false;
-#endif
-}
-
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8397,9 +8451,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
- env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8489,7 +8540,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* internally or be covered by avg_load imbalance (eventually).
*/
if (sgs->group_type == group_misfit_task &&
- (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -8573,7 +8624,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
- (group_smaller_min_cpu_capacity(sds->local, sg)))
+ (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
return false;
return true;
@@ -8940,11 +8991,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
-#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
- env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +9027,6 @@ next_group:
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
-#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
- WRITE_ONCE(nohz.next_blocked,
- jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
- }
-#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -9386,7 +9424,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- capacity_of(env->dst_cpu) < capacity &&
+ !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
nr_running == 1)
continue;
@@ -9676,7 +9714,7 @@ more_balance:
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
+ * given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
@@ -9776,9 +9814,6 @@ more_balance:
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
-
- /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else {
sd->nr_balance_failed = 0;
@@ -9820,7 +9855,7 @@ out_one_pinned:
/*
* newidle_balance() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
- * skyrocketting in a short amount of time. Skip the balance_interval
+ * skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
@@ -9928,13 +9963,7 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
- /*
- * can_migrate_task() doesn't need to compute new_dst_cpu
- * for active balancing. Since we have CPU_IDLE, but no
- * @dst_grpmask we need to make that test go away with lying
- * about DST_PINNED.
- */
- .flags = LBF_DST_PINNED,
+ .flags = LBF_ACTIVE_LB,
};
schedstat_inc(sd->alb_count);
@@ -10061,22 +10090,9 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance)) {
+ if (likely(update_next_balance))
rq->next_balance = next_balance;
-#ifdef CONFIG_NO_HZ_COMMON
- /*
- * If this CPU has been elected to perform the nohz idle
- * balance. Other idle CPUs have already rebalanced with
- * nohz_idle_balance() and nohz.next_balance has been
- * updated accordingly. This CPU is now running the idle load
- * balance for itself and we need to update the
- * nohz.next_balance accordingly.
- */
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
- nohz.next_balance = rq->next_balance;
-#endif
- }
}
static inline int on_null_domain(struct rq *rq)
@@ -10368,14 +10384,30 @@ out:
WRITE_ONCE(nohz.has_blocked, 1);
}
+static bool update_nohz_stats(struct rq *rq)
+{
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+}
+
/*
* Internal function that runs load balance for all idle cpus. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
*/
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
@@ -10385,7 +10417,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
- int ret = false;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10406,8 +10437,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
*/
smp_mb();
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ /*
+ * Start with the next CPU after this_cpu so we will end with this_cpu and give
+ * other idle CPUs a chance to pull load.
+ */
+ for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ if (!idle_cpu(balance_cpu))
continue;
/*
@@ -10422,7 +10457,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq, true);
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
@@ -10453,27 +10488,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
- /* Newly idle CPU doesn't need an update */
- if (idle != CPU_NEWLY_IDLE) {
- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
- }
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
- /* The full idle balance loop has been done */
- ret = true;
-
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
- return ret;
}
/*
@@ -10497,6 +10518,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return true;
}
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+ unsigned int flags;
+
+ flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+ /*
+ * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+ * (ie NOHZ_STATS_KICK set), as that softirq would do the same update.
+ */
+ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
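
Rather than running a stats-only ILB with the rq lock dropped, nohz_newidle_balance() now merely sets NOHZ_NEWILB_KICK and the blocked-load update happens later from do_idle() via nohz_run_idle_balance(); atomic_fetch_andnot() consumes the flag and, by returning the old value, reveals whether a NOHZ_STATS_KICK softirq is already pending and will do the work anyway. A standalone sketch of that handshake using C11 atomics; the flag names mirror the kernel's, but the bit values and print statements are illustrative:

#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_STATS_KICK		0x2	/* bit values chosen for illustration */
#define NOHZ_NEWILB_KICK	0x4

static atomic_uint nohz_flags;

/* newidle_balance(): note that blocked load wants updating, nothing more. */
static void request_newilb(void)
{
	atomic_fetch_or(&nohz_flags, NOHZ_NEWILB_KICK);
}

/* do_idle() -> nohz_run_idle_balance(): consume the flag and decide. */
static void run_idle_balance(void)
{
	unsigned int flags = atomic_fetch_and(&nohz_flags, ~NOHZ_NEWILB_KICK);

	/* Only update here if nothing else (e.g. a STATS kick) will do it. */
	if (flags == NOHZ_NEWILB_KICK)
		printf("updating blocked load from the idle path\n");
	else if (flags & NOHZ_NEWILB_KICK)
		printf("skipped: a pending kick (flags=%#x) will update it\n", flags);
}

int main(void)
{
	request_newilb();
	run_idle_balance();		/* updates here */

	request_newilb();
	atomic_fetch_or(&nohz_flags, NOHZ_STATS_KICK);
	run_idle_balance();		/* defers to the pending stats kick */
	return 0;
}
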
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
@@ -10517,16 +10556,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
/*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
*/
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10621,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
- nohz_newidle_balance(this_rq);
-
goto out;
}
@@ -10635,7 +10667,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
-out:
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
@@ -10644,16 +10675,19 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
@@ -10844,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq;
+ list_add_leaf_cfs_rq(cfs_rq_of(se));
+
/* Start to propagate at parent */
se = se->parent;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- break;
+ if (!cfs_rq_throttled(cfs_rq)) {
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ list_add_leaf_cfs_rq(cfs_rq);
+ continue;
+ }
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
}
}
#else
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1bc2b158fc51..7f8dace0964c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -27,7 +27,7 @@ SCHED_FEAT(NEXT_BUDDY, false)
SCHED_FEAT(LAST_BUDDY, true)
/*
- * Consider buddies to be cache hot, decreases the likelyness of a
+ * Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
@@ -90,3 +90,8 @@ SCHED_FEAT(WA_BIAS, true)
*/
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(LATENCY_WARN, false)
+
+SCHED_FEAT(ALT_PERIOD, true)
+SCHED_FEAT(BASE_SLICE, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7199e6f23789..7ca3d3d86c2a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -163,7 +163,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*
* NOTE: no locks or semaphores should be used here
*
- * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * On architectures that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
@@ -199,7 +199,7 @@ static void cpuidle_idle_call(void)
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in interrupts (if any). In that case bypass
- * the cpuidle governor and go stratight for the deepest idle state
+ * the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
@@ -261,6 +261,12 @@ exit_idle:
static void do_idle(void)
{
int cpu = smp_processor_id();
+
+ /*
+ * Check if we need to update blocked load
+ */
+ nohz_run_idle_balance(cpu);
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index d2a655643a02..1c79896f1bc0 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -189,7 +189,7 @@ calc_load_n(unsigned long load, unsigned long exp,
* w:0 1 1 0 0 1 1 0 0
*
* This ensures we'll fold the old NO_HZ contribution in this window while
- * accumlating the new one.
+ * accumulating the new one.
*
* - When we wake up from NO_HZ during the window, we push up our
* contribution, since we effectively move our sample point to a known
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2c613e1cff3a..a554e3bbab2b 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -133,7 +133,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* runnable = running = 0;
*
* clause from ___update_load_sum(); this results in
- * the below usage of @contrib to dissapear entirely,
+ * the below usage of @contrib to disappear entirely,
* so no point in calculating it.
*/
contrib = __accumulate_pelt_segments(periods,
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 795e43e02afc..1462846d244e 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -130,7 +130,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
- * considered as an always runnig rq without idle time to
+ * considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c0766c..cc25a3cff41f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but it does exist at the cgroup level, meaning all
+ * non-idle tasks in a cgroup are delayed on the CPU resource, which is
+ * used by others outside of the cgroup or throttled by the cgroup's
+ * cpu.max configuration.
*
* SOME = nr_delayed_tasks != 0
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -59,7 +62,7 @@
* states, we would have to conclude a CPU SOME pressure number of
* 100%, since *somebody* is waiting on a runqueue at all
* times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
* threads will be contended at any given time, or about 0.4%.
*
* Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,7 +76,7 @@
* we have to base our calculation on the number of non-idle tasks in
* conjunction with the number of available CPUs, which is the number
* of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
* threads that are unproductive due to delays:
*
* threads = min(nr_nonidle_tasks, nr_cpus)
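To make the proportions described above concrete, here is a minimal standalone C sketch (not part of the patch; the helper names and sample numbers are illustrative only) that computes SOME and FULL as delayed/unproductive shares of the possible execution threads:

#include <stdio.h>

/* Illustrative only: SOME is the share of possible execution threads that
 * have a delayed task, FULL the share with nothing productive to run. */
static double psi_some(unsigned nr_delayed, unsigned nr_nonidle, unsigned nr_cpus)
{
	unsigned threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;

	if (!threads)
		return 0.0;
	return (double)(nr_delayed < threads ? nr_delayed : threads) / threads;
}

static double psi_full(unsigned nr_running, unsigned nr_nonidle, unsigned nr_cpus)
{
	unsigned threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
	unsigned productive = nr_running < threads ? nr_running : threads;

	if (!threads)
		return 0.0;
	return (double)(threads - productive) / threads;
}

int main(void)
{
	/* 257 non-idle tasks on 256 CPUs, one delayed at any time: SOME is
	 * about 0.4%, not the 100% a naive task-count check would suggest. */
	printf("SOME = %.4f\n", psi_some(1, 257, 256));
	/* 4 possible threads of which only 3 have productive work: FULL = 25%. */
	printf("FULL = %.4f\n", psi_full(3, 4, 4));
	return 0;
}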
@@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
{
switch (state) {
case PSI_IO_SOME:
- return tasks[NR_IOWAIT];
+ return unlikely(tasks[NR_IOWAIT]);
case PSI_IO_FULL:
- return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+ return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
case PSI_MEM_SOME:
- return tasks[NR_MEMSTALL];
+ return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+ return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+ return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+ case PSI_CPU_FULL:
+ return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -441,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
mutex_unlock(&group->avgs_lock);
}
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
u64 prev_growth)
{
@@ -639,13 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
}
-static void record_times(struct psi_group_cpu *groupc, int cpu,
- bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
{
u32 delta;
- u64 now;
- now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;
@@ -659,34 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
- else if (memstall_tick) {
- u32 sample;
- /*
- * Since we care about lost potential, a
- * memstall is FULL when there are no other
- * working tasks, but also when the CPU is
- * actively reclaiming and nothing productive
- * could run even if it were runnable.
- *
- * When the timer tick sees a reclaiming CPU,
- * regardless of runnable tasks, sample a FULL
- * tick (or less if it hasn't been a full tick
- * since the last state change).
- */
- sample = min(delta, (u32)jiffies_to_nsecs(1));
- groupc->times[PSI_MEM_FULL] += sample;
- }
}
- if (groupc->state_mask & (1 << PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
groupc->times[PSI_CPU_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_CPU_FULL))
+ groupc->times[PSI_CPU_FULL] += delta;
+ }
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set,
+ unsigned int clear, unsigned int set, u64 now,
bool wake_clock)
{
struct psi_group_cpu *groupc;
@@ -706,19 +694,20 @@ static void psi_group_change(struct psi_group *group, int cpu,
*/
write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, false);
+ record_times(groupc, now);
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
- if (groupc->tasks[t] == 0 && !psi_bug) {
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
psi_bug = 1;
}
- groupc->tasks[t]--;
}
for (t = 0; set; set &= ~(1 << t), t++)
@@ -730,6 +719,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (test_state(groupc->tasks, s))
state_mask |= (1 << s);
}
+
+ /*
+ * Since we care about lost potential, a memstall is FULL
+ * when there are no other working tasks, but also when
+ * the CPU is actively reclaiming and nothing productive
+ * could run even if it were runnable. So when the current
+ * task in a cgroup is in_memstall, the corresponding groupc
+ * on that cpu is in PSI_MEM_FULL state.
+ */
+ if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+ state_mask |= (1 << PSI_MEM_FULL);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
@@ -786,12 +787,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
+ now = cpu_clock(cpu);
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
@@ -804,7 +807,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wake_clock = false;
while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, wake_clock);
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -813,56 +816,61 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
+ u64 now = cpu_clock(cpu);
if (next->pid) {
+ bool identical_state;
+
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When moving state between tasks, the group that
- * contains them both does not change: we can stop
- * updating the tree once we reach the first common
- * ancestor. Iterate @next's ancestors until we
- * encounter @prev's state.
+ * When switching between tasks that have an identical
+ * runtime state, the cgroup that contains both tasks
* does not change: we can stop updating the tree once
+ * we reach the first common ancestor. Iterate @next's
+ * ancestors only until we encounter @prev's ONCPU.
*/
+ identical_state = prev->psi_flags == next->psi_flags;
iter = NULL;
while ((group = iterate_groups(next, &iter))) {
- if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ if (identical_state &&
+ per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
common = group;
break;
}
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
}
- /*
- * If this is a voluntary sleep, dequeue will have taken care
- * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
- * only need to deal with it during preemption.
- */
- if (sleep)
- return;
-
if (prev->pid) {
- psi_flags_change(prev, TSK_ONCPU, 0);
+ int clear = TSK_ONCPU, set = 0;
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, TSK_ONCPU, 0, true);
- }
-}
+ /*
+ * When we're going to sleep, psi_dequeue() lets us handle
+ * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+ * with TSK_ONCPU and save walking common ancestors twice.
+ */
+ if (sleep) {
+ clear |= TSK_RUNNING;
+ if (prev->in_iowait)
+ set |= TSK_IOWAIT;
+ }
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
- struct psi_group *group;
- void *iter = NULL;
+ psi_flags_change(prev, clear, set);
- while ((group = iterate_groups(task, &iter))) {
- struct psi_group_cpu *groupc;
+ iter = NULL;
+ while ((group = iterate_groups(prev, &iter)) && group != common)
+ psi_group_change(group, cpu, clear, set, now, true);
- groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, true);
- write_seqcount_end(&groupc->seq);
+ /*
+ * TSK_ONCPU is handled up to the common ancestor. If we're tasked
+ * with dequeuing too, finish that for the rest of the hierarchy.
+ */
+ if (sleep) {
+ clear &= ~TSK_ONCPU;
+ for (; group; group = iterate_groups(prev, &iter))
+ psi_group_change(group, cpu, clear, set, now, true);
+ }
}
}
@@ -964,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- unsigned int task_flags = 0;
+ unsigned int task_flags;
struct rq_flags rf;
struct rq *rq;
@@ -979,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task)) {
- task_flags = TSK_RUNNING;
- if (task_current(rq, task))
- task_flags |= TSK_ONCPU;
- } else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
-
- if (task->in_memstall)
- task_flags |= TSK_MEMSTALL;
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
if (task_flags)
psi_task_change(task, task_flags, 0);
@@ -1018,7 +1042,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+ for (full = 0; full < 2; full++) {
unsigned long avg[3];
u64 total;
int w;
@@ -1054,19 +1078,27 @@ static int psi_cpu_show(struct seq_file *m, void *v)
return psi_show(m, &psi_system, PSI_CPU);
}
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+ if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ return single_open(file, psi_show, NULL);
+}
+
static int psi_io_open(struct inode *inode, struct file *file)
{
- return single_open(file, psi_io_show, NULL);
+ return psi_open(file, psi_io_show);
}
static int psi_memory_open(struct inode *inode, struct file *file)
{
- return single_open(file, psi_memory_show, NULL);
+ return psi_open(file, psi_memory_show);
}
static int psi_cpu_open(struct inode *inode, struct file *file)
{
- return single_open(file, psi_cpu_show, NULL);
+ return psi_open(file, psi_cpu_show);
}
struct psi_trigger *psi_trigger_create(struct psi_group *group,
@@ -1346,9 +1378,9 @@ static int __init psi_proc_init(void)
{
if (psi_enable) {
proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
}
return 0;
}
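With the pressure files now created with mode 0666 and the write path gated by CAP_SYS_RESOURCE in psi_open(), any user can read the system-level numbers, while registering a trigger (a write) still needs the capability. A minimal read-only sketch, assuming a kernel with CONFIG_PSI enabled; with this change the CPU file is expected to report a "full" line in addition to "some":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	/* O_RDONLY works for any user given the 0666 mode; O_RDWR would be
	 * rejected with EPERM unless the caller has CAP_SYS_RESOURCE. */
	int fd = open("/proc/pressure/cpu", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/pressure/cpu");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* "some ..." and "full ..." lines */
	}
	close(fd);
	return 0;
}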
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f720b71d13d..c286e5ba3c94 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -700,7 +700,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
- * indicate its been disabled and disalow stealing.
+ * indicate its been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
@@ -1998,7 +1998,7 @@ static void push_rt_tasks(struct rq *rq)
*
* Each root domain has its own irq work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
+ * task must be checked if there's one or many CPUs that are lowering
* their priority, there's a single irq work iterator that will try to
* push off RT tasks that are waiting to run.
*
@@ -2216,7 +2216,7 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
* than what's currently running on its CPU.
- * This is just that p is wakeing up and hasn't
+ * This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..a189bec13729 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,6 +36,7 @@
#include <uapi/linux/sched/types.h>
#include <linux/binfmts.h>
+#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
@@ -57,6 +58,7 @@
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
+#include <linux/ratelimit.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
@@ -205,6 +207,13 @@ static inline void update_avg(u64 *avg, u64 sample)
}
/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
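A standalone userspace sketch of the same guard, with min_t()/BITS_PER_TYPE() open-coded (illustrative only): the point is that the shift count is clamped to the operand width minus one, so the expression stays defined even for out-of-range shifts.

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Open-coded equivalent of the shr_bound() idea: clamp the shift to the
 * operand's width minus one so it is never undefined behaviour. */
#define SHR_BOUND(val, shift) \
	((val) >> ((shift) < (sizeof(val) * CHAR_BIT - 1) ? \
		   (shift) : (sizeof(val) * CHAR_BIT - 1)))

int main(void)
{
	uint32_t v = UINT32_C(0x80000000);

	/* "v >> 40" would be undefined behaviour; the clamped form shifts by
	 * 31 instead and prints 1. */
	printf("%" PRIu32 "\n", (uint32_t)SHR_BOUND(v, 40u));
	return 0;
}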
+/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
* This is actually gross. :(
@@ -963,6 +972,11 @@ struct rq {
atomic_t nr_iowait;
+#ifdef CONFIG_SCHED_DEBUG
+ u64 last_seen_need_resched_ns;
+ int ticks_without_resched;
+#endif
+
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
@@ -975,7 +989,6 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
- unsigned char balance_push;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1147,7 +1160,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
*
* if (rq-clock_update_flags >= RQCF_UPDATED)
*
- * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
@@ -1206,7 +1219,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
/*
* See rt task throttling, which is the only time a skip
- * request is cancelled.
+ * request is canceled.
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
@@ -1545,22 +1558,20 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-void register_sched_domain_sysctl(void);
+#ifdef CONFIG_SCHED_DEBUG
+void update_sched_domain_debugfs(void);
void dirty_sched_domain_sysctl(int cpu);
-void unregister_sched_domain_sysctl(void);
#else
-static inline void register_sched_domain_sysctl(void)
+static inline void update_sched_domain_debugfs(void)
{
}
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
-static inline void unregister_sched_domain_sysctl(void)
-{
-}
#endif
+extern int sched_update_scaling(void);
+
extern void flush_smp_call_function_from_idle(void);
#else /* !CONFIG_SMP: */
@@ -1853,7 +1864,7 @@ struct sched_class {
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serliazed by
+ * cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
@@ -2358,7 +2369,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
-extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
@@ -2366,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2373,6 +2386,8 @@ extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
#endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
@@ -2385,9 +2400,11 @@ extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
+#define NOHZ_NEWILB_KICK_BIT 2
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
@@ -2398,6 +2415,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
#ifdef CONFIG_SMP
static inline
@@ -2437,7 +2459,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
* and never move forward.
*/
static inline u64 irq_time_read(int cpu)
@@ -2718,5 +2740,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
#endif
-void swake_up_all_locked(struct swait_queue_head *q);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
+
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 750fb3c67eed..3f93fc3b5648 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -74,7 +74,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some CPUs, including cpu 0, may be missing so we have
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..dc218e9f4558 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
- int clear = TSK_RUNNING, set = 0;
+ int clear = TSK_RUNNING;
if (static_branch_likely(&psi_disabled))
return;
- if (!sleep) {
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
- } else {
- /*
- * When a task sleeps, schedule() dequeues it before
- * switching to the next one. Merge the clearing of
- * TSK_RUNNING and TSK_ONCPU to save an unnecessary
- * psi_task_change() call in psi_sched_switch().
- */
- clear |= TSK_ONCPU;
+ /*
+ * A voluntary sleep is a dequeue followed by a task switch. To
+ * avoid walking all ancestors twice, psi_task_switch() handles
+ * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+ * Do nothing here.
+ */
+ if (sleep)
+ return;
- if (p->in_iowait)
- set |= TSK_IOWAIT;
- }
+ if (p->in_memstall)
+ clear |= TSK_MEMSTALL;
- psi_task_change(p, clear, set);
+ psi_task_change(p, clear, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -144,14 +140,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-static inline void psi_task_tick(struct rq *rq)
-{
- if (static_branch_likely(&psi_disabled))
- return;
-
- if (unlikely(rq->curr->in_memstall))
- psi_memstall_tick(rq->curr, cpu_of(rq));
-}
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +147,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 09d35044bd88..55a0a243e871 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2;
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = true;
+ sched_debug_verbose = true;
return 0;
}
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
static inline bool sched_debug(void)
{
- return sched_debug_enabled;
+ return sched_debug_verbose;
}
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_debug_enabled)
+ if (!sched_debug_verbose)
return;
if (!sd) {
@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
@@ -723,35 +723,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
for (tmp = sd; tmp; tmp = tmp->parent)
numa_distance += !!(tmp->flags & SD_NUMA);
- /*
- * FIXME: Diameter >=3 is misrepresented.
- *
- * Smallest diameter=3 topology is:
- *
- * node 0 1 2 3
- * 0: 10 20 30 40
- * 1: 20 10 20 30
- * 2: 30 20 10 20
- * 3: 40 30 20 10
- *
- * 0 --- 1 --- 2 --- 3
- *
- * NUMA-3 0-3 N/A N/A 0-3
- * groups: {0-2},{1-3} {1-3},{0-2}
- *
- * NUMA-2 0-2 0-3 0-3 1-3
- * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
- *
- * NUMA-1 0-1 0-2 1-3 2-3
- * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
- *
- * NUMA-0 0 1 2 3
- *
- * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
- * group span isn't a subset of the domain span.
- */
- WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
-
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -963,7 +934,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
int cpu;
build_balance_mask(sd, sg, mask);
- cpu = cpumask_first_and(sched_group_span(sg), mask);
+ cpu = cpumask_first(mask);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
if (atomic_inc_return(&sg->sgc->ref) == 1)
@@ -982,6 +953,31 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
+static struct sched_domain *
+find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
+{
+ /*
+ * The proper descendant would be the one whose child won't span out
+ * of sd
+ */
+ while (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child),
+ sched_domain_span(sd)))
+ sibling = sibling->child;
+
+ /*
+ * As we are referencing sgc across different topology levels, we need
+ * to go down to skip those sched_domains which don't contribute to
+ * scheduling because they will be degenerated in cpu_attach_domain
+ */
+ while (sibling->child &&
+ cpumask_equal(sched_domain_span(sibling->child),
+ sched_domain_span(sibling)))
+ sibling = sibling->child;
+
+ return sibling;
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -1015,6 +1011,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
+ /*
+ * Usually we build sched_group by sibling's child sched_domain
+ * But for machines whose NUMA diameter are 3 or above, we move
+ * to build sched_group by sibling's proper descendant's child
+ * domain because sibling's child sched_domain will span out of
+ * the sched_domain being built as below.
+ *
+ * Smallest diameter=3 topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 40
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 40 30 20 10
+ *
+ * 0 --- 1 --- 2 --- 3
+ *
+ * NUMA-3 0-3 N/A N/A 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-2 0-2 0-3 0-3 1-3
+ * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
+ *
+ * NUMA-1 0-1 0-2 1-3 2-3
+ * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
+ * group span isn't a subset of the domain span.
+ */
+ if (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child), span))
+ sibling = find_descended_sibling(sd, sibling);
+
sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
@@ -1022,7 +1053,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- init_overlap_sched_group(sd, sg);
+ init_overlap_sched_group(sibling, sg);
if (!first)
first = sg;
@@ -2110,7 +2141,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
- if (rq && sched_debug_enabled) {
+ if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
@@ -2128,7 +2159,7 @@ static cpumask_var_t *doms_cur;
/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;
-/* Attribues of custom domains in 'doms_cur' */
+/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
/*
@@ -2192,7 +2223,6 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms;
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
return err;
}
@@ -2267,9 +2297,6 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
lockdep_assert_held(&sched_domains_mutex);
- /* Always unregister in case we don't destroy any domains: */
- unregister_sched_domain_sysctl();
-
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
@@ -2358,7 +2385,7 @@ match3:
dattr_cur = dattr_new;
ndoms_cur = ndoms_new;
- register_sched_domain_sysctl();
+ update_sched_domain_debugfs();
}
/*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1e63db4dbd9a..6ecd3f3a52b5 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -119,8 +119,11 @@ struct seccomp_kaddfd {
int fd;
unsigned int flags;
- /* To only be set on reply */
- int ret;
+ union {
+ bool setfd;
+ /* To only be set on reply */
+ int ret;
+ };
struct completion completion;
struct list_head list;
};
@@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
* that it has been handled.
*/
list_del_init(&addfd->list);
- addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+ if (!addfd->setfd)
+ addfd->ret = receive_fd(addfd->file, addfd->flags);
+ else
+ addfd->ret = receive_fd_replace(addfd->fd, addfd->file,
+ addfd->flags);
complete(&addfd->completion);
}
@@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
return -EBADF;
kaddfd.flags = addfd.newfd_flags;
- kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
- addfd.newfd : -1;
+ kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
+ kaddfd.fd = addfd.newfd;
init_completion(&kaddfd.completion);
ret = mutex_lock_interruptible(&filter->notify_lock);
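On the user side, the new setfd handling corresponds to SECCOMP_ADDFD_FLAG_SETFD in the addfd ioctl: without it the kernel allocates a new descriptor number in the target, with it the target's addfd.newfd is replaced. A hedged supervisor-side sketch, assuming a uapi <linux/seccomp.h> that provides struct seccomp_notif_addfd and SECCOMP_IOCTL_NOTIF_ADDFD, and a notify fd, notification id and source fd obtained from an existing SECCOMP_RET_USER_NOTIF loop:

#include <linux/seccomp.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch only: notify_fd, notif_id and src_fd come from the supervisor loop. */
static int addfd_example(int notify_fd, __u64 notif_id, int src_fd, int want_fixed_fd)
{
	struct seccomp_notif_addfd addfd;

	memset(&addfd, 0, sizeof(addfd));
	addfd.id = notif_id;
	addfd.srcfd = src_fd;
	addfd.newfd_flags = 0;

	if (want_fixed_fd) {
		/* Install at exactly descriptor 100 in the target (dup2-like). */
		addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
		addfd.newfd = 100;
	} else {
		/* Let the kernel pick the lowest free descriptor in the target. */
		addfd.flags = 0;
		addfd.newfd = 0;
	}

	/* Returns the installed fd number in the target, or -1 with errno set. */
	return ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
}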
diff --git a/kernel/signal.c b/kernel/signal.c
index dca53515ae3f..f7c6ffcbd044 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -408,7 +408,8 @@ void task_join_group_stop(struct task_struct *task)
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
+ int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
struct user_struct *user;
@@ -430,7 +431,16 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
rcu_read_unlock();
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
+ /*
+ * Preallocation does not hold sighand::siglock so it can't
+ * use the cache. The lockless caching requires that only
+ * one consumer and only one producer run at a time.
+ */
+ q = READ_ONCE(t->sigqueue_cache);
+ if (!q || sigqueue_flags)
+ q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+ else
+ WRITE_ONCE(t->sigqueue_cache, NULL);
} else {
print_dropped_signal(sig);
}
@@ -440,20 +450,51 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
- q->flags = 0;
+ q->flags = sigqueue_flags;
q->user = user;
}
return q;
}
+void exit_task_sigqueue_cache(struct task_struct *tsk)
+{
+ /* Race free because @tsk is mopped up */
+ struct sigqueue *q = tsk->sigqueue_cache;
+
+ if (q) {
+ tsk->sigqueue_cache = NULL;
+ /*
+ * Hand it back to the cache as the task might
+ * be self reaping which would leak the object.
+ */
+ kmem_cache_free(sigqueue_cachep, q);
+ }
+}
+
+static void sigqueue_cache_or_free(struct sigqueue *q)
+{
+ /*
+ * Cache one sigqueue per task. This pairs with the consumer side
+ * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the
+ * compiler from store tearing and to tell KCSAN that the data race
+ * is intentional when run without holding current->sighand->siglock,
+ * which is fine as current obviously cannot run __sigqueue_free()
+ * concurrently.
+ */
+ if (!READ_ONCE(current->sigqueue_cache))
+ WRITE_ONCE(current->sigqueue_cache, q);
+ else
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
if (atomic_dec_and_test(&q->user->sigpending))
free_uid(q->user);
- kmem_cache_free(sigqueue_cachep, q);
+ sigqueue_cache_or_free(q);
}
void flush_sigqueue(struct sigpending *queue)
@@ -1111,7 +1152,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
else
override_rlimit = 0;
- q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
+ q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
+
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
@@ -1822,12 +1864,7 @@ EXPORT_SYMBOL(kill_pid);
*/
struct sigqueue *sigqueue_alloc(void)
{
- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
-
- if (q)
- q->flags |= SIGQUEUE_PREALLOC;
-
- return q;
+ return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
}
void sigqueue_free(struct sigqueue *q)
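The cache added above is a single-slot, single-producer/single-consumer stash per task. A stripped-down userspace analogy of the alloc/free pairing (plain C, single-threaded, purely illustrative; the kernel side uses t->sigqueue_cache with READ_ONCE()/WRITE_ONCE()):

#include <stdlib.h>

struct obj { int payload; };

/* One cached object; a plain pointer stands in for the per-task slot and is
 * only safe here because this sketch is single-threaded. */
static struct obj *cache_slot;

static struct obj *obj_alloc(void)
{
	struct obj *o = cache_slot;

	if (o) {
		cache_slot = NULL;		/* consume the cached object */
		return o;
	}
	return malloc(sizeof(*o));		/* slow path: real allocation */
}

static void obj_free(struct obj *o)
{
	if (!cache_slot)
		cache_slot = o;			/* keep one object for next alloc */
	else
		free(o);			/* slot occupied: really free it */
}

int main(void)
{
	struct obj *a = obj_alloc();		/* slow path */

	obj_free(a);				/* stashed in the one-slot cache */
	a = obj_alloc();			/* fast path: reuses the stash */
	free(a);
	return 0;
}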
diff --git a/kernel/smp.c b/kernel/smp.c
index f472ef623956..52bf159ec400 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
} while (0)
/* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(call_single_data_t *csd)
+static void __csd_lock_record(struct __call_single_data *csd)
{
if (!csd) {
smp_mb(); /* NULL cur_csd after unlock. */
@@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd)
/* Or before unlock, as the case may be. */
}
-static __always_inline void csd_lock_record(call_single_data_t *csd)
+static __always_inline void csd_lock_record(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled))
__csd_lock_record(csd);
}
-static int csd_lock_wait_getcpu(call_single_data_t *csd)
+static int csd_lock_wait_getcpu(struct __call_single_data *csd)
{
unsigned int csd_type;
@@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type)
return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
}
-static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
+static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
{
struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
unsigned int srccpu = csd->node.src;
@@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
* so waiting on other types gets much less information.
*/
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
{
int cpu = -1;
int cpux;
@@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void __csd_lock_wait(call_single_data_t *csd)
+static void __csd_lock_wait(struct __call_single_data *csd)
{
int bug_id = 0;
u64 ts0, ts1;
@@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd)
smp_acquire__after_ctrl_dep();
}
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled)) {
__csd_lock_wait(csd);
@@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
#else
#define cfd_seq_store(var, src, dst, type)
-static void csd_lock_record(call_single_data_t *csd)
+static void csd_lock_record(struct __call_single_data *csd)
{
}
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
-static __always_inline void csd_lock(call_single_data_t *csd)
+static __always_inline void csd_lock(struct __call_single_data *csd)
{
csd_lock_wait(csd);
csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd)
smp_wmb();
}
-static __always_inline void csd_unlock(call_single_data_t *csd)
+static __always_inline void csd_unlock(struct __call_single_data *csd)
{
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
@@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, call_single_data_t *csd)
+static int generic_exec_single(int cpu, struct __call_single_data *csd)
{
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
@@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
int err = 0;
@@ -850,12 +850,28 @@ call:
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
+/*
+ * Flags to be used as scf_flags argument of smp_call_function_many_cond().
+ *
+ * %SCF_WAIT: Wait until function execution is completed
+ * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask
+ */
+#define SCF_WAIT (1U << 0)
+#define SCF_RUN_LOCAL (1U << 1)
+
static void smp_call_function_many_cond(const struct cpumask *mask,
smp_call_func_t func, void *info,
- bool wait, smp_cond_func_t cond_func)
+ unsigned int scf_flags,
+ smp_cond_func_t cond_func)
{
+ int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ bool wait = scf_flags & SCF_WAIT;
+ bool run_remote = false;
+ bool run_local = false;
+ int nr_cpus = 0;
+
+ lockdep_assert_preemption_disabled();
/*
* Can deadlock when called with interrupts disabled.
@@ -863,8 +879,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
- WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress && !early_boot_irqs_disabled);
+ if (cpu_online(this_cpu) && !oops_in_progress &&
+ !early_boot_irqs_disabled)
+ lockdep_assert_irqs_enabled();
/*
* When @wait we can deadlock when we interrupt between llist_add() and
@@ -874,70 +891,75 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
*/
WARN_ON_ONCE(!in_task());
- /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
+ /* Check if we need local execution. */
+ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
+ run_local = true;
+
+ /* Check if we need remote execution, i.e., any CPU excluding this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+ if (cpu < nr_cpu_ids)
+ run_remote = true;
- /* No online cpus? We're done. */
- if (cpu >= nr_cpu_ids)
- return;
+ if (run_remote) {
+ cfd = this_cpu_ptr(&cfd_data);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
- /* Do we have another CPU which isn't us? */
- next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (next_cpu == this_cpu)
- next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
-
- /* Fastpath: do that cpu by itself. */
- if (next_cpu >= nr_cpu_ids) {
- if (!cond_func || cond_func(cpu, info))
- smp_call_function_single(cpu, func, info, wait);
- return;
- }
+ cpumask_clear(cfd->cpumask_ipi);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
+ call_single_data_t *csd = &pcpu->csd;
- cfd = this_cpu_ptr(&cfd_data);
+ if (cond_func && !cond_func(cpu, info))
+ continue;
- cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- __cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ csd_lock(csd);
+ if (wait)
+ csd->node.u_flags |= CSD_TYPE_SYNC;
+ csd->func = func;
+ csd->info = info;
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+ csd->node.src = smp_processor_id();
+ csd->node.dst = cpu;
+#endif
+ cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
+ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+ nr_cpus++;
+ last_cpu = cpu;
- /* Some callers race with other cpus changing the passed mask */
- if (unlikely(!cpumask_weight(cfd->cpumask)))
- return;
+ cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
+ } else {
+ cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
+ }
+ }
- cpumask_clear(cfd->cpumask_ipi);
- for_each_cpu(cpu, cfd->cpumask) {
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
- call_single_data_t *csd = &pcpu->csd;
+ cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING);
- if (cond_func && !cond_func(cpu, info))
- continue;
+ /*
+ * Choose the most efficient way to send an IPI. Note that the
+ * number of CPUs might be zero due to concurrent changes to the
+ * provided mask.
+ */
+ if (nr_cpus == 1)
+ send_call_function_single_ipi(last_cpu);
+ else if (likely(nr_cpus > 1))
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
- csd_lock(csd);
- if (wait)
- csd->node.u_flags |= CSD_TYPE_SYNC;
- csd->func = func;
- csd->info = info;
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- csd->node.src = smp_processor_id();
- csd->node.dst = cpu;
-#endif
- cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
- if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
- __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
- cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
- } else {
- cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
- }
+ cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED);
}
- /* Send a message to all CPUs in the map */
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu,
- CFD_SEQ_NOCPU, CFD_SEQ_PING);
- arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu,
- CFD_SEQ_NOCPU, CFD_SEQ_PINGED);
+ if (run_local && (!cond_func || cond_func(this_cpu, info))) {
+ unsigned long flags;
- if (wait) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+
+ if (run_remote && wait) {
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd;
@@ -948,12 +970,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
}
/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
+ * smp_call_function_many(): Run a function on a set of CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
+ * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait
+ * (atomically) until function has completed on other CPUs. If
+ * %SCF_RUN_LOCAL is set, the function will also be run locally
+ * if the local CPU is set in the @cpumask.
*
* If @wait is true, then returns once @func has returned.
*
@@ -964,7 +988,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- smp_call_function_many_cond(mask, func, info, wait, NULL);
+ smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -1076,56 +1100,6 @@ void __init smp_init(void)
}
/*
- * Call a function on all processors. May be used during early boot while
- * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
- * of local_irq_disable/enable().
- */
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
- unsigned long flags;
-
- preempt_disable();
- smp_call_function(func, info, wait);
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- preempt_enable();
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/**
- * on_each_cpu_mask(): Run a function on processors specified by
- * cpumask, which may include the local processor.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. The
- * exception is that it may be used during early boot while
- * early_boot_irqs_disabled is set.
- */
-void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
- void *info, bool wait)
-{
- int cpu = get_cpu();
-
- smp_call_function_many(mask, func, info, wait);
- if (cpumask_test_cpu(cpu, mask)) {
- unsigned long flags;
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
- put_cpu();
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
-/*
* on_each_cpu_cond(): Call a function on each processor for which
* the supplied function cond_func returns true, optionally waiting
* for all the required CPUs to finish. This may include the local
@@ -1150,27 +1124,17 @@ EXPORT_SYMBOL(on_each_cpu_mask);
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
{
- int cpu = get_cpu();
+ unsigned int scf_flags = SCF_RUN_LOCAL;
- smp_call_function_many_cond(mask, func, info, wait, cond_func);
- if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
- unsigned long flags;
+ if (wait)
+ scf_flags |= SCF_WAIT;
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
- put_cpu();
+ preempt_disable();
+ smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
+ preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait)
-{
- on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
static void do_nothing(void *unused)
{
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 971d8acceaec..cbc30271ea4d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -409,6 +409,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
+ work->caller = _RET_IP_;
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 3d62c9599dc0..3a583a29815f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
/*
* RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
- * infite. In case of RLIM_INFINITY the posix CPU timer code
+ * infinite. In case of RLIM_INFINITY the posix CPU timer code
* ignores the rlimit.
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
@@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
/*
- * arg_lock protects concurent updates but we still need mmap_lock for
+ * arg_lock protects concurrent updates but we still need mmap_lock for
* read to exclude races with sys_brk.
*/
mmap_read_lock(mm);
@@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* output in procfs mostly, except
*
* - @start_brk/@brk which are used in do_brk_flags but kernel lookups
- * for VMAs when updating these memvers so anything wrong written
+ * for VMAs when updating these members so anything wrong written
* here cause kernel to swear at userspace program but won't lead
* to any problem in kernel itself
*/
@@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
/*
- * arg_lock protects concurent updates of arg boundaries, we need
+ * arg_lock protects concurrent updates of arg boundaries, we need
* mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
* validation.
*/
@@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
* If command line arguments and environment
* are placed somewhere else on stack, we can
* set them up here, ARG_START/END to setup
- * command line argumets and ENV_START/END
+ * command line arguments and ENV_START/END
* for environment.
*/
case PR_SET_MM_START_STACK:
@@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti
static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
/*
- * If task has has_child_subreaper - all its decendants
- * already have these flag too and new decendants will
+ * If task has has_child_subreaper - all its descendants
+ * already have these flag too and new descendants will
* inherit it on fork, skip them.
*
* If we've found child_reaper - skip descendants in
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 19aa806890d5..0ea8128468c3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -99,6 +99,7 @@ COND_SYSCALL(flock);
/* fs/quota.c */
COND_SYSCALL(quotactl);
+COND_SYSCALL(quotactl_path);
/* fs/readdir.c */
@@ -266,6 +267,11 @@ COND_SYSCALL(request_key);
COND_SYSCALL(keyctl);
COND_SYSCALL_COMPAT(keyctl);
+/* security/landlock/syscalls.c */
+COND_SYSCALL(landlock_create_ruleset);
+COND_SYSCALL(landlock_add_rule);
+COND_SYSCALL(landlock_restrict_self);
+
/* arch/example/kernel/sys_example.c */
/* mm/fadvise.c */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..14edf84cc571 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -148,6 +148,9 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
+#ifdef CONFIG_FANOTIFY
+#include <linux/fanotify.h>
+#endif
#ifdef CONFIG_PROC_SYSCTL
@@ -184,17 +187,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
int sysctl_legacy_va_layout;
#endif
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
static int max_extfrag_threshold = 1000;
@@ -1034,6 +1026,65 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
+/**
+ * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or an error on write when the range check fails.
+ */
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp;
+ unsigned int min = 0, max = 255U, val;
+ u8 *data = table->data;
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = &min,
+ .max = &max,
+ };
+ int res;
+
+ /* Do not support arrays yet. */
+ if (table->maxlen != sizeof(u8))
+ return -EINVAL;
+
+ if (table->extra1) {
+ min = *(unsigned int *) table->extra1;
+ if (min > 255U)
+ return -EINVAL;
+ }
+ if (table->extra2) {
+ max = *(unsigned int *) table->extra2;
+ if (max > 255U)
+ return -EINVAL;
+ }
+
+ tmp = *table;
+
+ tmp.maxlen = sizeof(val);
+ tmp.data = &val;
+ val = *data;
+ res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+ if (res)
+ return res;
+ if (write)
+ *data = val;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
+
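A caller of the new handler would wire it up roughly as in the kernel-side sketch below (illustrative only; the table, variable and limit names are invented and this is not a standalone program). Note that extra1/extra2 point at unsigned int bounds even though the data itself is a u8, matching the casts in the handler above.

/* Hypothetical u8 sysctl wired to proc_dou8vec_minmax(). */
static u8 example_mode;
static unsigned int example_mode_min = 0;
static unsigned int example_mode_max = 3;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_mode",
		.data		= &example_mode,
		.maxlen		= sizeof(u8),	/* arrays are rejected */
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= &example_mode_min,
		.extra2		= &example_mode_max,
	},
	{ }
};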
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
@@ -1582,6 +1633,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -1659,58 +1716,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_SCHED_DEBUG
- {
- .procname = "sched_min_granularity_ns",
- .data = &sysctl_sched_min_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_latency_ns",
- .data = &sysctl_sched_latency,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_wakeup_granularity_ns",
- .data = &sysctl_sched_wakeup_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_wakeup_granularity_ns,
- .extra2 = &max_wakeup_granularity_ns,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "sched_tunable_scaling",
- .data = &sysctl_sched_tunable_scaling,
- .maxlen = sizeof(enum sched_tunable_scaling),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_tunable_scaling,
- .extra2 = &max_sched_tunable_scaling,
- },
- {
- .procname = "sched_migration_cost_ns",
- .data = &sysctl_sched_migration_cost,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_nr_migrate",
- .data = &sysctl_sched_nr_migrate,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -1722,38 +1727,8 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
-#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
- .procname = "numa_balancing_scan_delay_ms",
- .data = &sysctl_numa_balancing_scan_delay,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_min_ms",
- .data = &sysctl_numa_balancing_scan_period_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_max_ms",
- .data = &sysctl_numa_balancing_scan_period_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_size_mb",
- .data = &sysctl_numa_balancing_scan_size,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
- {
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(unsigned int),
@@ -1763,7 +1738,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -2856,7 +2830,7 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_COMPACTION
{
.procname = "compact_memory",
- .data = &sysctl_compact_memory,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = sysctl_compaction_handler,
@@ -3258,7 +3232,14 @@ static struct ctl_table fs_table[] = {
.mode = 0555,
.child = inotify_table,
},
-#endif
+#endif
+#ifdef CONFIG_FANOTIFY
+ {
+ .procname = "fanotify",
+ .mode = 0555,
+ .child = fanotify_table,
+ },
+#endif
#ifdef CONFIG_EPOLL
{
.procname = "epoll",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 9cde961875c0..1698fbe6f0e1 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -34,6 +34,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
{
struct callback_head *head;
+ /* record the work call stack in order to print it in KASAN reports */
+ kasan_record_aux_stack(work);
+
do {
head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
@@ -59,18 +62,17 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
+ * task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
- * @func: identifies the work to remove
- *
- * Find the last queued pending work with ->func == @func and remove
- * it from queue.
+ * @match: match function to call
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_match(struct task_struct *task,
+ bool (*match)(struct callback_head *, void *data),
+ void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
@@ -86,7 +88,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
- if (work->func != func)
+ if (!match(work, data))
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
break;
@@ -96,6 +98,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
return work;
}
+static bool task_work_func_match(struct callback_head *cb, void *data)
+{
+ return cb->func == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ return task_work_cancel_match(task, task_work_func_match, func);
+}
+
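With the split above, task_work_cancel() keeps its old ->func-only behaviour while task_work_cancel_match() lets a caller supply its own predicate. A hedged sketch of a caller that cancels one specific queued item rather than any work with a given function (the request type and helpers are invented for illustration):

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

struct example_req {
	struct callback_head work;
	int id;
};

static bool example_match(struct callback_head *cb, void *data)
{
	/* Match one particular request, not merely any work with this ->func. */
	return container_of(cb, struct example_req, work) == data;
}

static void example_cancel(struct task_struct *task, struct example_req *req)
{
	struct callback_head *cb;

	cb = task_work_cancel_match(task, example_match, req);
	if (cb)	/* the work was still pending and is now ours to free */
		kfree(container_of(cb, struct example_req, work));
}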
/**
* task_work_run - execute the works added by task_work_add()
*
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index bea9d08b1698..5897828b9d7e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -92,7 +92,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (rtcdev)
return -EBUSY;
- if (!rtc->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
return -1;
if (!device_may_wakeup(rtc->dev.parent))
return -1;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1d1a61371b5a..2cd902592fc1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_arch_init(cs);
+ if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+ cs->id = CSID_GENERIC;
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 81fe2a33b80c..8a364aa9881a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_clock_read(&tk->tkr_mono);
+ systime_snapshot->cs_id = tk->tkr_mono.clock->id;
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b0c45d923f0f..d2d7cf6cfe83 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -372,188 +372,34 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
-static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
- size_t bufsz)
-{
- void __user *user_ptr = (__force void __user *)unsafe_ptr;
-
- buf[0] = 0;
-
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- if ((unsigned long)unsafe_ptr < TASK_SIZE) {
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
- fallthrough;
-#endif
- case 'k':
- strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
- break;
- case 'u':
- strncpy_from_user_nofault(buf, user_ptr, bufsz);
- break;
- }
-}
-
static DEFINE_RAW_SPINLOCK(trace_printk_lock);
-#define BPF_TRACE_PRINTK_SIZE 1024
+#define MAX_TRACE_PRINTK_VARARGS 3
+#define BPF_TRACE_PRINTK_SIZE 1024
-static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
+ u64, arg2, u64, arg3)
{
+ u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
+ u32 *bin_args;
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
- va_list ap;
int ret;
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
+ MAX_TRACE_PRINTK_VARARGS);
+ if (ret < 0)
+ return ret;
+
raw_spin_lock_irqsave(&trace_printk_lock, flags);
- va_start(ap, fmt);
- ret = vsnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
- /* vsnprintf() will not append null for zero-length strings */
- if (ret == 0)
- buf[0] = '\0';
+ ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+
trace_bpf_trace_printk(buf);
raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
- return ret;
-}
-
-/*
- * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
- */
-BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
- u64, arg2, u64, arg3)
-{
- int i, mod[3] = {}, fmt_cnt = 0;
- char buf[64], fmt_ptype;
- void *unsafe_ptr = NULL;
- bool str_seen = false;
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- return -EINVAL;
-
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
- return -EINVAL;
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt_cnt >= 3)
- return -EINVAL;
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- } else if (fmt[i] == 'p') {
- mod[fmt_cnt]++;
- if ((fmt[i + 1] == 'k' ||
- fmt[i + 1] == 'u') &&
- fmt[i + 2] == 's') {
- fmt_ptype = fmt[i + 1];
- i += 2;
- goto fmt_str;
- }
-
- if (fmt[i + 1] == 'B') {
- i++;
- goto fmt_next;
- }
-
- /* disallow any further format extensions */
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- goto fmt_next;
- } else if (fmt[i] == 's') {
- mod[fmt_cnt]++;
- fmt_ptype = fmt[i];
-fmt_str:
- if (str_seen)
- /* allow only one '%s' per fmt string */
- return -EINVAL;
- str_seen = true;
-
- if (fmt[i + 1] != 0 &&
- !isspace(fmt[i + 1]) &&
- !ispunct(fmt[i + 1]))
- return -EINVAL;
-
- switch (fmt_cnt) {
- case 0:
- unsafe_ptr = (void *)(long)arg1;
- arg1 = (long)buf;
- break;
- case 1:
- unsafe_ptr = (void *)(long)arg2;
- arg2 = (long)buf;
- break;
- case 2:
- unsafe_ptr = (void *)(long)arg3;
- arg3 = (long)buf;
- break;
- }
-
- bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
- sizeof(buf));
- goto fmt_next;
- }
-
- if (fmt[i] == 'l') {
- mod[fmt_cnt]++;
- i++;
- }
-
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x')
- return -EINVAL;
-fmt_next:
- fmt_cnt++;
- }
+ bpf_bprintf_cleanup();
-/* Horrid workaround for getting va_list handling working with different
- * argument type combinations generically for 32 and 64 bit archs.
- */
-#define __BPF_TP_EMIT() __BPF_ARG3_TP()
-#define __BPF_TP(...) \
- bpf_do_trace_printk(fmt, ##__VA_ARGS__)
-
-#define __BPF_ARG1_TP(...) \
- ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_TP(arg1, ##__VA_ARGS__) \
- : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
- : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
-
-#define __BPF_ARG2_TP(...) \
- ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
- : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
- : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
-
-#define __BPF_ARG3_TP(...) \
- ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
- ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
- : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
- ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
- : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
-
- return __BPF_TP_EMIT();
+ return ret;
}
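Instead of hand-rolling format parsing and the va_list macros, the helper now leans on the shared bprintf machinery: bpf_bprintf_prepare() validates the format and packs the three u64 arguments into a binary arg list, bstr_printf() renders it, and bpf_bprintf_cleanup() releases the per-CPU scratch buffers. From the BPF program side nothing changes; a hedged sketch of a program exercising the helper (the section and probe target are illustrative):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/types.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/do_sys_openat2")
int trace_open(void *ctx)
{
	const char fmt[] = "openat from pid %d on cpu %u\n";
	__u32 pid = bpf_get_current_pid_tgid() >> 32;

	/* Still at most three u64 arguments after fmt/fmt_size. */
	bpf_trace_printk(fmt, sizeof(fmt), pid, bpf_get_smp_processor_id());
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

The output still lands on the bpf_trace_printk trace event and trace_pipe as before.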
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -581,184 +427,27 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
}
#define MAX_SEQ_PRINTF_VARARGS 12
-#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
-#define MAX_SEQ_PRINTF_STR_LEN 128
-
-struct bpf_seq_printf_buf {
- char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
-};
-static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
-static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
{
- int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
- int i, buf_used, copy_size, num_args;
- u64 params[MAX_SEQ_PRINTF_VARARGS];
- struct bpf_seq_printf_buf *bufs;
- const u64 *args = data;
-
- buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
- if (WARN_ON_ONCE(buf_used > 1)) {
- err = -EBUSY;
- goto out;
- }
-
- bufs = this_cpu_ptr(&bpf_seq_printf_buf);
-
- /*
- * bpf_check()->check_func_arg()->check_stack_boundary()
- * guarantees that fmt points to bpf program stack,
- * fmt_size bytes of it were initialized and fmt_size > 0
- */
- if (fmt[--fmt_size] != 0)
- goto out;
-
- if (data_len & 7)
- goto out;
-
- for (i = 0; i < fmt_size; i++) {
- if (fmt[i] == '%') {
- if (fmt[i + 1] == '%')
- i++;
- else if (!data || !data_len)
- goto out;
- }
- }
+ int err, num_args;
+ u32 *bin_args;
+ if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
num_args = data_len / 8;
- /* check format string for allowed specifiers */
- for (i = 0; i < fmt_size; i++) {
- /* only printable ascii for now. */
- if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
- err = -EINVAL;
- goto out;
- }
-
- if (fmt[i] != '%')
- continue;
-
- if (fmt[i + 1] == '%') {
- i++;
- continue;
- }
-
- if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
- err = -E2BIG;
- goto out;
- }
-
- if (fmt_cnt >= num_args) {
- err = -EINVAL;
- goto out;
- }
-
- /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
- i++;
-
- /* skip optional "[0 +-][num]" width formating field */
- while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
- fmt[i] == ' ')
- i++;
- if (fmt[i] >= '1' && fmt[i] <= '9') {
- i++;
- while (fmt[i] >= '0' && fmt[i] <= '9')
- i++;
- }
-
- if (fmt[i] == 's') {
- void *unsafe_ptr;
-
- /* try our best to copy */
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
-
- unsafe_ptr = (void *)(long)args[fmt_cnt];
- err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
- if (err < 0)
- bufs->buf[memcpy_cnt][0] = '\0';
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
-
- if (fmt[i] == 'p') {
- if (fmt[i + 1] == 0 ||
- fmt[i + 1] == 'K' ||
- fmt[i + 1] == 'x' ||
- fmt[i + 1] == 'B') {
- /* just kernel pointers */
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- continue;
- }
-
- /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
- if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
- err = -EINVAL;
- goto out;
- }
- if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
- err = -EINVAL;
- goto out;
- }
-
- if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
- err = -E2BIG;
- goto out;
- }
-
-
- copy_size = (fmt[i + 2] == '4') ? 4 : 16;
-
- err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
- (void *) (long) args[fmt_cnt],
- copy_size);
- if (err < 0)
- memset(bufs->buf[memcpy_cnt], 0, copy_size);
- params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
- i += 2;
- fmt_cnt++;
- memcpy_cnt++;
- continue;
- }
-
- if (fmt[i] == 'l') {
- i++;
- if (fmt[i] == 'l')
- i++;
- }
-
- if (fmt[i] != 'i' && fmt[i] != 'd' &&
- fmt[i] != 'u' && fmt[i] != 'x' &&
- fmt[i] != 'X') {
- err = -EINVAL;
- goto out;
- }
+ err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ if (err < 0)
+ return err;
- params[fmt_cnt] = args[fmt_cnt];
- fmt_cnt++;
- }
+ seq_bprintf(m, fmt, bin_args);
- /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
- * all of them to seq_printf().
- */
- seq_printf(m, fmt, params[0], params[1], params[2], params[3],
- params[4], params[5], params[6], params[7], params[8],
- params[9], params[10], params[11]);
+ bpf_bprintf_cleanup();
- err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
-out:
- this_cpu_dec(bpf_seq_printf_buf_used);
- return err;
+ return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
@@ -1367,6 +1056,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_task_storage_get:
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ return &bpf_task_storage_delete_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
default:
return NULL;
}
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 29a6ebeebc9e..b8a0d1d564fb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void)
}
/**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
+ * ftrace_graph_stop - set to permanently disable function graph tracing
*
* In case of an error int function graph tracing, this is called
* to try to keep function graph tracing from causing any more harm.
@@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
/*
* Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses is unpredicatable anymore.
+ * since call sequence and return addresses are unpredictable anyway.
* Ex: BPF trampoline may call original function and may skip frame
* depending on type of BPF programs attached.
*/
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ba52d4e1314..2e8a3fde7104 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = {
};
/*
- * Used by the stack undwinder to know about dynamic ftrace trampolines.
+ * Used by the stack unwinder to know about dynamic ftrace trampolines.
*/
struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
@@ -1090,7 +1090,7 @@ struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
int index;
- int size;
+ int order;
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
@@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
* When the kernel is preemptive, tasks can be preempted
* while on a ftrace trampoline. Just scheduling a task on
* a CPU is not good enough to flush them. Calling
- * synchornize_rcu_tasks() will wait for those tasks to
+ * synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
if (IS_ENABLED(CONFIG_PREEMPTION))
@@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
if (WARN_ON(!count))
return -EINVAL;
+ /* We want to fill as much as possible, with no empty pages */
pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
- order = get_count_order(pages);
-
- /*
- * We want to fill as much as possible. No more than a page
- * may be empty.
- */
- if (!is_power_of_2(pages))
- order--;
+ order = fls(pages) - 1;
again:
pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
@@ -3181,7 +3175,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
ftrace_number_of_groups++;
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
- pg->size = cnt;
+ pg->order = order;
if (cnt > count)
cnt = count;
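Dropping get_count_order() in favour of fls(pages) - 1 keeps the old behaviour of rounding the allocation order down when the requested page count is not a power of two, so no group ends up mostly empty. A tiny sketch of the arithmetic, with an invented helper name:

#include <linux/bitops.h>

static int example_record_order(unsigned int pages)
{
	/* e.g. pages == 5: fls(5) == 3, so the order is 2 (four pages),
	 * the same result the old get_count_order()/is_power_of_2() pair
	 * produced; the remaining records spill into the next group. */
	return fls(pages) - 1;
}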
@@ -3194,7 +3188,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
- int order;
int cnt;
if (!num_to_init)
@@ -3230,13 +3223,13 @@ ftrace_allocate_pages(unsigned long num_to_init)
free_pages:
pg = start_pg;
while (pg) {
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- if (order >= 0)
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
start_pg = pg->next;
kfree(pg);
pg = start_pg;
- ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
}
pr_info("ftrace: FAILED to allocate memory for functions\n");
@@ -5407,7 +5400,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
* @reset - non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
- * If @ip is NULL, it failes to update filter.
+ * If @ip is NULL, it fails to update filter.
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
@@ -5631,7 +5624,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
- ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+ ftrace_process_regex(iter, parser->buffer,
+ parser->idx, enable);
}
trace_parser_put(parser);
@@ -6221,6 +6217,7 @@ static int ftrace_process_locs(struct module *mod,
p = start;
pg = start_pg;
while (p < end) {
+ unsigned long end_offset;
addr = ftrace_call_adjust(*p++);
/*
* Some architecture linkers will pad between
@@ -6231,7 +6228,8 @@ static int ftrace_process_locs(struct module *mod,
if (!addr)
continue;
- if (pg->index == pg->size) {
+ end_offset = (pg->index+1) * sizeof(pg->records[0]);
+ if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
@@ -6359,7 +6357,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
}
}
-/* Clear any records from hashs */
+/* Clear any records from hashes */
static void clear_mod_from_hashes(struct ftrace_page *pg)
{
struct trace_array *tr;
@@ -6400,7 +6398,6 @@ void ftrace_release_mod(struct module *mod)
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
- int order;
mutex_lock(&ftrace_lock);
@@ -6451,12 +6448,12 @@ void ftrace_release_mod(struct module *mod)
/* Needs to be called outside of ftrace_lock */
clear_mod_from_hashes(pg);
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- if (order >= 0)
- free_pages((unsigned long)pg->records, order);
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
tmp_page = pg->next;
kfree(pg);
- ftrace_number_of_pages -= 1 << order;
ftrace_number_of_groups--;
}
}
@@ -6774,7 +6771,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
struct ftrace_mod_map *mod_map = NULL;
struct ftrace_init_func *func, *func_next;
struct list_head clear_hash;
- int order;
INIT_LIST_HEAD(&clear_hash);
@@ -6812,10 +6808,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- if (order >= 0)
- free_pages((unsigned long)pg->records, order);
- ftrace_number_of_pages -= 1 << order;
+ if (pg->records) {
+ free_pages((unsigned long)pg->records, pg->order);
+ ftrace_number_of_pages -= 1 << pg->order;
+ }
ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 68744c51517e..2c0ee6484990 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -287,17 +287,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
-/**
- * ring_buffer_event_time_stamp - return the event's extended timestamp
- * @event: the event to get the timestamp of
- *
- * Returns the extended timestamp associated with a data event.
- * An extended time_stamp is a 64-bit timestamp represented
- * internally in a special way that makes the best use of space
- * contained within a ring buffer event. This function decodes
- * it and maps it to a straight u64 value.
- */
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -487,6 +477,8 @@ struct rb_time_struct {
#endif
typedef struct rb_time_struct rb_time_t;
+#define MAX_NEST 5
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -524,6 +516,7 @@ struct ring_buffer_per_cpu {
unsigned long read_bytes;
rb_time_t write_stamp;
rb_time_t before_stamp;
+ u64 event_stamp[MAX_NEST];
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -749,6 +742,99 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
}
#endif
+/*
+ * Enable this to make sure that the event passed to
+ * ring_buffer_event_time_stamp() is not committed and also
+ * is on the buffer that was passed in.
+ */
+//#define RB_VERIFY_EVENT
+#ifdef RB_VERIFY_EVENT
+static struct list_head *rb_list_head(struct list_head *list);
+static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+ struct buffer_page *page = cpu_buffer->commit_page;
+ struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page);
+ struct list_head *next;
+ long commit, write;
+ unsigned long addr = (unsigned long)event;
+ bool done = false;
+ int stop = 0;
+
+ /* Make sure the event exists and is not committed yet */
+ do {
+ if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
+ done = true;
+ commit = local_read(&page->page->commit);
+ write = local_read(&page->write);
+ if (addr >= (unsigned long)&page->page->data[commit] &&
+ addr < (unsigned long)&page->page->data[write])
+ return;
+
+ next = rb_list_head(page->list.next);
+ page = list_entry(next, struct buffer_page, list);
+ } while (!done);
+ WARN_ON_ONCE(1);
+}
+#else
+static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+ void *event)
+{
+}
+#endif
+
+
+static inline u64 rb_time_stamp(struct trace_buffer *buffer);
+
+/**
+ * ring_buffer_event_time_stamp - return the event's current time stamp
+ * @buffer: The buffer that the event is on
+ * @event: the event to get the time stamp of
+ *
+ * Note, this must be called after @event is reserved, and before it is
+ * committed to the ring buffer. And must be called from the same
+ * context where the event was reserved (normal, softirq, irq, etc).
+ *
+ * Returns the time stamp associated with the current event.
+ * If the event has an extended time stamp, then that is used as
+ * the time stamp to return.
+ * In the highly unlikely case that the event was nested more than
+ * the max nesting, then the write_stamp of the buffer is returned,
+ * otherwise current time is returned, but that really neither of
+ * the last two cases should ever happen.
+ */
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
+ unsigned int nest;
+ u64 ts;
+
+ /* If the event includes an absolute time, then just use that */
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
+ return rb_event_time_stamp(event);
+
+ nest = local_read(&cpu_buffer->committing);
+ verify_event(cpu_buffer, event);
+ if (WARN_ON_ONCE(!nest))
+ goto fail;
+
+ /* Read the current saved nesting level time stamp */
+ if (likely(--nest < MAX_NEST))
+ return cpu_buffer->event_stamp[nest];
+
+ /* Shouldn't happen, warn if it does */
+ WARN_ONCE(1, "nest (%d) greater than max", nest);
+
+ fail:
+ /* Can only fail on 32 bit */
+ if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
+ /* Screw it, just read the current time */
+ ts = rb_time_stamp(cpu_buffer->buffer);
+
+ return ts;
+}
+
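Because the time stamp now lives in a per-nesting-level slot on the CPU buffer, the rewritten ring_buffer_event_time_stamp() has to be handed the buffer as well, and is only meaningful between reserve and commit in the same context. A hedged sketch of that call pattern (the payload layout is invented):

#include <linux/ring_buffer.h>

static void example_write(struct trace_buffer *buffer)
{
	struct ring_buffer_event *event;
	u64 *entry;

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	/* Only valid between reserve and commit, in the same context. */
	*entry = ring_buffer_event_time_stamp(buffer, event);

	ring_buffer_unlock_commit(buffer, event);
}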
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
@@ -994,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer)
return ts << DEBUG_SHIFT;
}
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
{
u64 time;
@@ -2710,6 +2796,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned length = info->length;
u64 delta = info->delta;
+ unsigned int nest = local_read(&cpu_buffer->committing) - 1;
+
+ if (!WARN_ON_ONCE(nest >= MAX_NEST))
+ cpu_buffer->event_stamp[nest] = info->ts;
/*
* If we need to add a timestamp, then we
@@ -2766,7 +2856,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event)
return 0;
case RINGBUF_TYPE_TIME_EXTEND:
- return ring_buffer_event_time_stamp(event);
+ return rb_event_time_stamp(event);
case RINGBUF_TYPE_TIME_STAMP:
return 0;
@@ -3064,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* is called before preempt_count() is updated, since the check will
* be on the NORMAL bit, the TRANSITION bit will then be set. If an
* NMI then comes in, it will set the NMI bit, but when the NMI code
- * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
* and leave the NMI bit set. But this is fine, because the interrupt
* code that set the TRANSITION bit will then clear the NMI bit when it
* calls trace_recursive_unlock(). If another NMI comes in, it will
@@ -3212,13 +3302,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
switch (event->type_len) {
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
ts += delta;
pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
ts = delta;
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;
@@ -3289,12 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
switch (event->type_len) {
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
ts += delta;
break;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
ts = delta;
break;
@@ -3451,7 +3541,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
info->after, ts)) {
/* Nothing came after this event between C and E */
info->delta = ts - info->after;
- info->ts = ts;
} else {
/*
* Interrupted between C and E:
@@ -3463,6 +3552,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
*/
info->delta = 0;
}
+ info->ts = ts;
info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
}
@@ -4256,12 +4346,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
cpu_buffer->read_stamp = delta;
return;
@@ -4286,12 +4376,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- delta = ring_buffer_event_time_stamp(event);
+ delta = rb_event_time_stamp(event);
iter->read_stamp = delta;
return;
@@ -4544,7 +4634,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4635,7 +4725,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
- *ts = ring_buffer_event_time_stamp(event);
+ *ts = rb_event_time_stamp(event);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -5021,6 +5111,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_time_set(&cpu_buffer->write_stamp, 0);
rb_time_set(&cpu_buffer->before_stamp, 0);
+ memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp));
+
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index a4b4bbf8c3bf..0b15e975d2c2 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Test module for in-kernel sythetic event creation and generation.
+ * Test module for in-kernel synthetic event creation and generation.
*
* Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
*/
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 915fe8790f04..a21ef9cd2aae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list)
* @filtered_pids: The list of pids to check
* @search_pid: The PID to find in @filtered_pids
*
- * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
*/
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
@@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids,
struct task_struct *task)
{
/*
- * If filterd_no_pids is not empty, and the task's pid is listed
+ * If filtered_no_pids is not empty, and the task's pid is listed
* in filtered_no_pids, then return true.
* Otherwise, if filtered_pids is empty, that means we can
* trace all tasks. If it has content, then only trace pids
@@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
(*pos)++;
- /* pid already is +1 of the actual prevous bit */
+ /* pid already is +1 of the actual previous bit */
pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
/* Return pid + 1 to allow zero to be represented */
@@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ts = ring_buffer_time_stamp(buf->buffer);
ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
@@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock);
* The content of events may become garbage if we allow other process consumes
* these events concurrently:
* A) the page of the consumed events may become a normal page
- * (not reader page) in ring buffer, and this page will be rewrited
+ * (not reader page) in ring buffer, and this page will be rewritten
* by events producer.
* B) The page of the consumed events may become a page for splice_read,
* and this page will be returned to system.
@@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
#undef C
#define C(a, b) b
-/* These must match the bit postions in trace_iterator_flags */
+/* These must match the bit positions in trace_iterator_flags */
static const char *trace_options[] = {
TRACE_FLAGS
NULL
@@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr)
static int trace_save_cmdline(struct task_struct *tsk)
{
- unsigned pid, idx;
+ unsigned tpid, idx;
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(tsk->pid > PID_MAX_DEFAULT))
- return 0;
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
* It's not the end of the world if we don't get
@@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
- idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tpid];
if (idx == NO_CMDLINE_MAP) {
idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
- /*
- * Check whether the cmdline buffer at idx has a pid
- * mapped. We are going to overwrite that entry so we
- * need to clear the map_pid_to_cmdline. Otherwise we
- * would read the new comm for the old pid.
- */
- pid = savedcmd->map_cmdline_to_pid[idx];
- if (pid != NO_CMDLINE_MAP)
- savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
-
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
savedcmd->cmdline_idx = idx;
}
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
@@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
+ int tpid;
if (!pid) {
strcpy(comm, "<idle>");
@@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[])
return;
}
- if (pid > PID_MAX_DEFAULT) {
- strcpy(comm, "<...>");
- return;
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
}
-
- map = savedcmd->map_pid_to_cmdline[pid];
- if (map != NO_CMDLINE_MAP)
- strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- else
- strcpy(comm, "<...>");
+ strcpy(comm, "<...>");
}
void trace_find_cmdline(int pid, char comm[])
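Rather than refusing to save comms for PIDs above PID_MAX_DEFAULT, the map is now indexed by pid & (PID_MAX_DEFAULT - 1), and map_cmdline_to_pid[] keeps the full PID so a lookup can tell whether the slot still belongs to the PID being asked about or has been reused by a colliding one (in which case "<...>" is printed). A toy illustration of the collision, using the in-kernel constant:

#include <linux/threads.h>

/* Illustrative: with PID_MAX_DEFAULT of 0x8000 (typical configs), PIDs
 * 40000 and 7232 share the same cmdline slot (40000 & 0x7fff == 7232);
 * the stored full PID is what disambiguates them on lookup. */
static unsigned int example_cmdline_slot(int pid)
{
	return pid & (PID_MAX_DEFAULT - 1);
}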
@@ -2737,12 +2726,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
unsigned int trace_ctx)
{
struct ring_buffer_event *entry;
+ struct trace_array *tr = trace_file->tr;
int val;
- *current_rb = trace_file->tr->array_buffer.buffer;
+ *current_rb = tr->array_buffer.buffer;
- if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
- (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+ if (!tr->no_filter_buffering_ref &&
+ (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
/* Try to use the per cpu buffer first */
val = this_cpu_inc_return(trace_buffered_event_cnt);
@@ -3116,6 +3106,40 @@ static void ftrace_trace_userstack(struct trace_array *tr,
#endif /* CONFIG_STACKTRACE */
+static inline void
+func_repeats_set_delta_ts(struct func_repeats_entry *entry,
+ unsigned long long delta)
+{
+ entry->bottom_delta_ts = delta & U32_MAX;
+ entry->top_delta_ts = (delta >> 32);
+}
+
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx)
+{
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct func_repeats_entry *entry;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS,
+ sizeof(*entry), trace_ctx);
+ if (!event)
+ return;
+
+ delta = ring_buffer_event_time_stamp(buffer, event) -
+ last_info->ts_last_call;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = last_info->ip;
+ entry->parent_ip = last_info->parent_ip;
+ entry->count = last_info->count;
+ func_repeats_set_delta_ts(entry, delta);
+
+ __buffer_unlock_commit(buffer, event);
+}
+
/* created for use with alloc_percpu */
struct trace_buffer_struct {
int nesting;
@@ -3368,7 +3392,7 @@ int trace_array_vprintk(struct trace_array *tr,
* buffer (use trace_printk() for that), as writing into the top level
* buffer should only have events that can be individually disabled.
* trace_printk() is only used for debugging a kernel, and should not
- * be ever encorporated in normal use.
+ * be ever incorporated in normal use.
*
* trace_array_printk() can be used, as it will not add noise to the
* top level tracing buffer.
@@ -3562,6 +3586,227 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
return tmp;
}
+/* Returns true if the string is safe to dereference from an event */
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+{
+ unsigned long addr = (unsigned long)str;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+
+ /* OK if part of the event data */
+ if ((addr >= (unsigned long)iter->ent) &&
+ (addr < (unsigned long)iter->ent + iter->ent_size))
+ return true;
+
+ /* OK if part of the temp seq buffer */
+ if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
+ (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ return true;
+
+ /* Core rodata can not be freed */
+ if (is_kernel_rodata(addr))
+ return true;
+
+ if (trace_is_tracepoint_string(str))
+ return true;
+
+ /*
+ * Now this could be a module event, referencing core module
+ * data, which is OK.
+ */
+ if (!iter->ent)
+ return false;
+
+ trace_event = ftrace_find_event(iter->ent->type);
+ if (!trace_event)
+ return false;
+
+ event = container_of(trace_event, struct trace_event_call, event);
+ if (!event->mod)
+ return false;
+
+ /* Would rather have rodata, but this will suffice */
+ if (within_module_core(addr, event->mod))
+ return true;
+
+ return false;
+}
+
+static const char *show_buffer(struct trace_seq *s)
+{
+ struct seq_buf *seq = &s->seq;
+
+ seq_buf_terminate(seq);
+
+ return seq->buffer;
+}
+
+static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+
+static int test_can_verify_check(const char *fmt, ...)
+{
+ char buf[16];
+ va_list ap;
+ int ret;
+
+ /*
+ * The verifier depends on vsnprintf() modifying the va_list
+ * passed to it, where it is sent as a reference. Some architectures
+ * (like x86_32) pass it by value, which means that vsnprintf()
+ * does not modify the va_list passed to it, and the verifier
+ * would then need to be able to understand all the values that
+ * vsnprintf can use. If it is passed by value, then the verifier
+ * is disabled.
+ */
+ va_start(ap, fmt);
+ vsnprintf(buf, 16, "%d", ap);
+ ret = va_arg(ap, int);
+ va_end(ap);
+
+ return ret;
+}
+
+static void test_can_verify(void)
+{
+ if (!test_can_verify_check("%d %d", 0, 1)) {
+ pr_info("trace event string verifier disabled\n");
+ static_branch_inc(&trace_no_verify);
+ }
+}
+
+/**
+ * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * @iter: The iterator that holds the seq buffer and the event being printed
+ * @fmt: The format used to print the event
+ * @ap: The va_list holding the data to print from @fmt.
+ *
+ * This writes the data into the @iter->seq buffer using the data from
+ * @fmt and @ap. If the format has a %s, then the source of the string
+ * is examined to make sure it is safe to print, otherwise it will
+ * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
+ * pointer.
+ */
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap)
+{
+ const char *p = fmt;
+ const char *str;
+ int i, j;
+
+ if (WARN_ON_ONCE(!fmt))
+ return;
+
+ if (static_branch_unlikely(&trace_no_verify))
+ goto print;
+
+ /* Don't bother checking when doing a ftrace_dump() */
+ if (iter->fmt == static_fmt_buf)
+ goto print;
+
+ while (*p) {
+ bool star = false;
+ int len = 0;
+
+ j = 0;
+
+ /* We only care about %s and variants */
+ for (i = 0; p[i]; i++) {
+ if (i + 1 >= iter->fmt_size) {
+ /*
+ * If we can't expand the copy buffer,
+ * just print it.
+ */
+ if (!trace_iter_expand_format(iter))
+ goto print;
+ }
+
+ if (p[i] == '\\' && p[i+1]) {
+ i++;
+ continue;
+ }
+ if (p[i] == '%') {
+ /* Need to test cases like %08.*s */
+ for (j = 1; p[i+j]; j++) {
+ if (isdigit(p[i+j]) ||
+ p[i+j] == '.')
+ continue;
+ if (p[i+j] == '*') {
+ star = true;
+ continue;
+ }
+ break;
+ }
+ if (p[i+j] == 's')
+ break;
+ star = false;
+ }
+ j = 0;
+ }
+ /* If no %s found then just print normally */
+ if (!p[i])
+ break;
+
+ /* Copy up to the %s, and print that */
+ strncpy(iter->fmt, p, i);
+ iter->fmt[i] = '\0';
+ trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+
+ if (star)
+ len = va_arg(ap, int);
+
+ /* The ap now points to the string data of the %s */
+ str = va_arg(ap, const char *);
+
+ /*
+ * If you hit this warning, it is likely that the
+ * trace event in question used %s on a string that
+ * was saved at the time of the event, but may not be
+ * around when the trace is read. Use __string(),
+ * __assign_str() and __get_str() helpers in the TRACE_EVENT()
+ * instead. See samples/trace_events/trace-events-sample.h
+ * for reference.
+ */
+ if (WARN_ONCE(!trace_safe_str(iter, str),
+ "fmt: '%s' current_buffer: '%s'",
+ fmt, show_buffer(&iter->seq))) {
+ int ret;
+
+ /* Try to safely read the string */
+ if (star) {
+ if (len + 1 > iter->fmt_size)
+ len = iter->fmt_size - 1;
+ if (len < 0)
+ len = 0;
+ ret = copy_from_kernel_nofault(iter->fmt, str, len);
+ iter->fmt[len] = 0;
+ star = false;
+ } else {
+ ret = strncpy_from_kernel_nofault(iter->fmt, str,
+ iter->fmt_size);
+ }
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", str);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ str, iter->fmt);
+ str = "[UNSAFE-MEMORY]";
+ strcpy(iter->fmt, "%s");
+ } else {
+ strncpy(iter->fmt, p + i, j + 1);
+ iter->fmt[j+1] = '\0';
+ }
+ if (star)
+ trace_seq_printf(&iter->seq, iter->fmt, len, str);
+ else
+ trace_seq_printf(&iter->seq, iter->fmt, str);
+
+ p += i + j + 1;
+ }
+ print:
+ if (*p)
+ trace_seq_vprintf(&iter->seq, p, ap);
+}
+
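The remedy the warning points at is to copy the string into the event at record time, so the %s argument refers to event data that trace_safe_str() accepts. A hedged sketch of that pattern with the __string()/__assign_str()/__get_str() helpers (the event and its fields are invented):

/* In some trace/events/example.h, wrapped by the usual TRACE_EVENT
 * boilerplate and CREATE_TRACE_POINTS machinery. */
TRACE_EVENT(example_open,

	TP_PROTO(const char *filename),

	TP_ARGS(filename),

	TP_STRUCT__entry(
		__string(name, filename)
	),

	TP_fast_assign(
		__assign_str(name, filename);
	),

	TP_printk("file=%s", __get_str(name))
);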
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
{
const char *p, *new_fmt;
@@ -6768,7 +7013,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
/* do not add \n before testing triggers, but add \0 */
entry->buf[cnt] = '\0';
- tt = event_triggers_call(tr->trace_marker_file, entry, event);
+ tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
}
if (entry->buf[cnt - 1] != '\n') {
@@ -6976,31 +7221,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
return ret;
}
-int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
+{
+ if (rbe == this_cpu_read(trace_buffered_event))
+ return ring_buffer_time_stamp(buffer);
+
+ return ring_buffer_event_time_stamp(buffer, rbe);
+}
+
+/*
+ * Set or disable using the per CPU trace_buffer_event when possible.
+ */
+int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (abs && tr->time_stamp_abs_ref++)
+ if (set && tr->no_filter_buffering_ref++)
goto out;
- if (!abs) {
- if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ if (!set) {
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
ret = -EINVAL;
goto out;
}
- if (--tr->time_stamp_abs_ref)
- goto out;
+ --tr->no_filter_buffering_ref;
}
-
- ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer)
- ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
-#endif
out:
mutex_unlock(&trace_types_lock);
@@ -7336,11 +7584,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
* @cmd: The tracing command that caused the error
* @str: The string to position the caret at within @cmd
*
- * Finds the position of the first occurence of @str within @cmd. The
+ * Finds the position of the first occurrence of @str within @cmd. The
* return value can be passed to tracing_log_err() for caret placement
* within @cmd.
*
- * Returns the index within @cmd of the first occurence of @str or 0
+ * Returns the index within @cmd of the first occurrence of @str or 0
* if @str was not found.
*/
unsigned int err_pos(char *cmd, const char *str)
@@ -7890,7 +8138,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
t, usec_rem);
- t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer));
usec_rem = do_div(t, USEC_PER_SEC);
trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
} else {
@@ -7899,7 +8147,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
trace_seq_printf(s, "now ts: %llu\n",
- ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ ring_buffer_time_stamp(trace_buf->buffer));
}
cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
@@ -8906,6 +9154,7 @@ static int __remove_instance(struct trace_array *tr)
ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove(tr->dir);
+ free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
for (i = 0; i < tr->nr_topts; i++) {
@@ -9123,7 +9372,7 @@ int tracing_init_dentry(void)
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
* the tracefs file system there, so older tools still
- * work with the newer kerenl.
+ * work with the newer kernel.
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
@@ -9676,6 +9925,8 @@ __init static int tracer_alloc_buffers(void)
register_snapshot_cmd();
+ test_can_verify();
+
return 0;
out_free_savedcmd:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a6446c03cfbc..cd80d046c7a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -45,6 +45,7 @@ enum trace_type {
TRACE_BPUTS,
TRACE_HWLAT,
TRACE_RAW_DATA,
+ TRACE_FUNC_REPEATS,
__TRACE_LAST_TYPE,
};
@@ -262,6 +263,17 @@ struct cond_snapshot {
};
/*
+ * struct trace_func_repeats - used to keep track of the consecutive
+ * (on the same CPU) calls of a single function.
+ */
+struct trace_func_repeats {
+ unsigned long ip;
+ unsigned long parent_ip;
+ unsigned long count;
+ u64 ts_last_call;
+};
+
+/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
@@ -352,11 +364,12 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
- int time_stamp_abs_ref;
+ int no_filter_buffering_ref;
struct list_head hist_vars;
#ifdef CONFIG_TRACER_SNAPSHOT
struct cond_snapshot *cond_snapshot;
#endif
+ struct trace_func_repeats __percpu *last_func_repeats;
};
enum {
@@ -372,7 +385,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
-extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
+extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
@@ -441,6 +455,8 @@ extern void __ftrace_bad_type(void);
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
+ IF_ASSIGN(var, ent, struct func_repeats_entry, \
+ TRACE_FUNC_REPEATS); \
__ftrace_bad_type(); \
} while (0)
@@ -581,7 +597,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
struct ring_buffer_event *event);
+bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+ va_list ap);
int trace_empty(struct trace_iterator *iter);
@@ -676,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
}
#endif /* CONFIG_STACKTRACE */
+void trace_last_func_repeats(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx);
+
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
@@ -1329,7 +1352,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
- *tt = event_triggers_call(file, entry, event);
+ *tt = event_triggers_call(file, buffer, entry, event);
if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
(unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1343,7 +1366,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
/**
* event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
@@ -1370,7 +1393,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
/**
* event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
* @buffer: The ring buffer that the event is being written to
* @event: The event meta data in the ring buffer
* @entry: The event itself
@@ -1626,7 +1649,7 @@ extern int register_trigger_hist_enable_disable_cmds(void);
*/
struct event_trigger_ops {
void (*func)(struct event_trigger_data *data,
- void *rec,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aaf6793ededa..c1637f90c8a3 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
- u64 now;
+ u64 now, prev_time;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = sched_clock_cpu(this_cpu);
+
/*
- * If in an NMI context then dont risk lockups and return the
- * cpu_clock() time:
+ * The global clock "guarantees" that the events are ordered
+ * between CPUs. But if two events on two different CPUS call
+ * trace_clock_global at roughly the same time, it really does
+ * not matter which one gets the earlier time. Just make sure
+ * that the same CPU will always show a monotonic clock.
+ *
+ * Use a read memory barrier to get the latest written
+ * time that was recorded.
*/
- if (unlikely(in_nmi()))
- goto out;
+ smp_rmb();
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ now = sched_clock_cpu(this_cpu);
- arch_spin_lock(&trace_clock_struct.lock);
+ /* Make sure that now is always greater than prev_time */
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time + 1;
/*
- * TODO: if this happens often then maybe we should reset
- * my_scd->clock to prev_time+1, to make sure
- * we start ticking with the local clock from now on?
+ * If in an NMI context then don't risk lockups and simply return
+ * the current time.
*/
- if ((s64)(now - trace_clock_struct.prev_time) < 0)
- now = trace_clock_struct.prev_time + 1;
+ if (unlikely(in_nmi()))
+ goto out;
- trace_clock_struct.prev_time = now;
+ /* Tracing can cause strange recursion, always use a try lock */
+ if (arch_spin_trylock(&trace_clock_struct.lock)) {
+ /* Reread prev_time in case it was already updated */
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time + 1;
- arch_spin_unlock(&trace_clock_struct.lock);
+ trace_clock_struct.prev_time = now;
+ /* The unlock acts as the wmb for the above rmb */
+ arch_spin_unlock(&trace_clock_struct.lock);
+ }
out:
raw_local_irq_restore(flags);
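The monotonicity clamp relies on doing the subtraction in u64 and interpreting the delta as signed, so a stale prev_time (or wrap-around) shows up as a negative difference and now is bumped to prev_time + 1. A minimal sketch of just that check, with invented names:

#include <linux/types.h>

static u64 example_monotonic(u64 now, u64 prev_time)
{
	/* A signed view of the unsigned delta handles wrap-around cleanly. */
	if ((s64)(now - prev_time) < 0)
		now = prev_time + 1;

	return now;
}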
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4547ac59da61..251c819cf0c5 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__entry->nmi_total_ts,
__entry->nmi_count)
);
+
+#define FUNC_REPEATS_GET_DELTA_TS(entry) \
+ (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \
+
+FTRACE_ENTRY(func_repeats, func_repeats_entry,
+
+ TRACE_FUNC_REPEATS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned long, parent_ip )
+ __field( u16 , count )
+ __field( u16 , top_delta_ts )
+ __field( u32 , bottom_delta_ts )
+ ),
+
+ F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)",
+ (void *)__entry->ip,
+ (void *)__entry->parent_ip,
+ __entry->count,
+ FUNC_REPEATS_GET_DELTA_TS(__entry))
+);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 288ad2c274fb..03be4435d103 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
/*
* Force it to be aligned to unsigned long to avoid misaligned accesses
- * suprises
+ * surprises
*/
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
perf_trace_t;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a3563afd412d..80e96989770e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field = call->class->fields_array;
+ const char *array_descriptor;
+ const char *p = fmt;
+ int len;
+
+ if (!(len = str_has_prefix(fmt, "REC->")))
+ return false;
+ fmt += len;
+ for (p = fmt; *p; p++) {
+ if (!isalnum(*p) && *p != '_')
+ break;
+ }
+ len = p - fmt;
+
+ for (; field->type; field++) {
+ if (strncmp(field->name, fmt, len) ||
+ field->name[len])
+ continue;
+ array_descriptor = strchr(field->type, '[');
+ /* This is an array and is OK to dereference. */
+ return array_descriptor != NULL;
+ }
+ return false;
+}
+
+/*
+ * Examine the print fmt of the event looking for unsafe dereference
+ * pointers using %p* that could be recorded in the trace event and
+ * much later referenced after the pointer was freed. Dereferencing
+ * a pointer is OK if it points into the event itself.
+ */
+static void test_event_printk(struct trace_event_call *call)
+{
+ u64 dereference_flags = 0;
+ bool first = true;
+ const char *fmt, *c, *r, *a;
+ int parens = 0;
+ char in_quote = 0;
+ int start_arg = 0;
+ int arg = 0;
+ int i;
+
+ fmt = call->print_fmt;
+
+ if (!fmt)
+ return;
+
+ for (i = 0; fmt[i]; i++) {
+ switch (fmt[i]) {
+ case '\\':
+ i++;
+ if (!fmt[i])
+ return;
+ continue;
+ case '"':
+ case '\'':
+ /*
+ * The print fmt starts with a string that
+ * is processed first to find %p* usage,
+ * then after the first string, the print fmt
+ * contains arguments that are used to check
+ * if the dereferenced %p* usage is safe.
+ */
+ if (first) {
+ if (fmt[i] == '\'')
+ continue;
+ if (in_quote) {
+ arg = 0;
+ first = false;
+ /*
+ * If there were no %p* uses,
+ * the fmt is OK.
+ */
+ if (!dereference_flags)
+ return;
+ }
+ }
+ if (in_quote) {
+ if (in_quote == fmt[i])
+ in_quote = 0;
+ } else {
+ in_quote = fmt[i];
+ }
+ continue;
+ case '%':
+ if (!first || !in_quote)
+ continue;
+ i++;
+ if (!fmt[i])
+ return;
+ switch (fmt[i]) {
+ case '%':
+ continue;
+ case 'p':
+ /* Find dereferencing fields */
+ switch (fmt[i + 1]) {
+ case 'B': case 'R': case 'r':
+ case 'b': case 'M': case 'm':
+ case 'I': case 'i': case 'E':
+ case 'U': case 'V': case 'N':
+ case 'a': case 'd': case 'D':
+ case 'g': case 't': case 'C':
+ case 'O': case 'f':
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ }
+ break;
+ default:
+ {
+ bool star = false;
+ int j;
+
+ /* Increment arg if %*s exists. */
+ for (j = 0; fmt[i + j]; j++) {
+ if (isdigit(fmt[i + j]) ||
+ fmt[i + j] == '.')
+ continue;
+ if (fmt[i + j] == '*') {
+ star = true;
+ continue;
+ }
+ if ((fmt[i + j] == 's') && star)
+ arg++;
+ break;
+ }
+ break;
+ } /* default */
+
+ } /* switch */
+ arg++;
+ continue;
+ case '(':
+ if (in_quote)
+ continue;
+ parens++;
+ continue;
+ case ')':
+ if (in_quote)
+ continue;
+ parens--;
+ if (WARN_ONCE(parens < 0,
+ "Paren mismatch for event: %s\narg='%s'\n%*s",
+ trace_event_name(call),
+ fmt + start_arg,
+ (i - start_arg) + 5, "^"))
+ return;
+ continue;
+ case ',':
+ if (in_quote || parens)
+ continue;
+ i++;
+ while (isspace(fmt[i]))
+ i++;
+ start_arg = i;
+ if (!(dereference_flags & (1ULL << arg)))
+ goto next_arg;
+
+ /* Find the REC-> in the argument */
+ c = strchr(fmt + i, ',');
+ r = strstr(fmt + i, "REC->");
+ if (r && (!c || r < c)) {
+ /*
+ * Addresses of events on the buffer,
+ * or an array on the buffer, are
+ * OK to dereference.
+ * There are ways to fool this, but
+ * this is to catch common mistakes,
+ * not malicious code.
+ */
+ a = strchr(fmt + i, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+ next_arg:
+ i--;
+ arg++;
+ }
+ }
+
+ /*
+ * If you triggered the warning below, the reported trace event
+ * uses an unsafe %p* pointer dereference. As the data stored
+ * at the trace event time may no longer exist when the trace
+ * event is printed, dereferencing to the original source is
+ * unsafe. The source of the dereference must be copied into the
+ * event itself, and the dereference must access the copy instead.
+ */
+ if (WARN_ON_ONCE(dereference_flags)) {
+ arg = 1;
+ while (!(dereference_flags & 1)) {
+ dereference_flags >>= 1;
+ arg++;
+ }
+ pr_warn("event %s has unsafe dereference of argument %d\n",
+ trace_event_name(call), arg);
+ pr_warn("print_fmt: %s\n", fmt);
+ }
+}
+
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
@@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call)
if (!id)
return -ENODEV;
+ test_event_printk(call);
+
return 0;
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
@@ -2436,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
/*
- * Since calls are grouped by systems, the likelyhood that the
+ * Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
* previous call is high. As an optimization, we skip searching
* for a map[] that matches the call's system if the last call
@@ -2496,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
}
/*
- * Just create a decriptor for early init. A descriptor is required
+ * Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
* the filesystem is initialized.
*/
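The test_event_printk() checker added above keeps one bit per format argument in a 64-bit mask: a bit is set when a dereferencing %p* conversion is seen and cleared again when the matching argument turns out to point into the event record (e.g. "REC->field"). Here is a standalone sketch of that bookkeeping with made-up argument indices, including the same trailing scan that turns the first remaining bit into a 1-based argument number.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dereference_flags = 0;
	int arg;

	dereference_flags |= 1ULL << 2;		/* conversion 2 looked unsafe...    */
	dereference_flags &= ~(1ULL << 2);	/* ...but its argument was REC->foo */
	dereference_flags |= 1ULL << 5;		/* conversion 5 stays flagged       */

	if (dereference_flags) {
		arg = 1;
		while (!(dereference_flags & 1)) {	/* same loop as in the patch */
			dereference_flags >>= 1;
			arg++;
		}
		/* bit 5 is reported as argument 6: the report is 1-based */
		printf("unsafe dereference of argument %d\n", arg);
	}
	return 0;
}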
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e91259f6a722..c9124038b140 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -256,7 +256,7 @@ enum {
* is "&&" we don't call update_preds(). Instead continue to "c". As the
* next token after "c" is not "&&" but the end of input, we first process the
* "&&" by calling update_preds() for the "&&" then we process the "||" by
- * callin updates_preds() with the values for processing "||".
+ * calling update_preds() with the values for processing "||".
*
* What does that mean? What update_preds() does is to first save the "target"
* of the program entry indexed by the current program entry's "target"
@@ -296,7 +296,7 @@ enum {
* and "FALSE" the program entry after that, we are now done with the first
* pass.
*
- * Making the above "a || b && c" have a progam of:
+ * Making the above "a || b && c" have a program of:
* prog[0] = { "a", 1, 2 }
* prog[1] = { "b", 0, 2 }
* prog[2] = { "c", 0, 3 }
@@ -390,7 +390,7 @@ enum {
* F: return FALSE
*
* As "r = a; if (!r) goto n5;" is obviously the same as
- * "if (!a) goto n5;" without doing anything we can interperate the
+ * "if (!a) goto n5;" without doing anything we can interpret the
* program as:
* n1: if (!a) goto n5;
* n2: if (!b) goto n5;
@@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
/**
* create_filter - create a filter for a trace_event_call
+ * @tr: the trace array associated with these events
* @call: trace_event_call to create a filter for
* @filter_str: filter string
* @set_str: remember @filter_str and enable detailed error in filter
@@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr,
}
/**
- * create_system_filter - create a filter for an event_subsystem
- * @system: event_subsystem to create a filter for
+ * create_system_filter - create a filter for an event subsystem
+ * @dir: the descriptor for the subsystem directory
* @filter_str: filter string
* @filterp: out param for created filter (always updated on return)
*
@@ -1750,7 +1751,6 @@ int create_event_filter(struct trace_array *tr,
* and always remembers @filter_str.
*/
static int create_system_filter(struct trace_subsystem_dir *dir,
- struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
struct filter_parse_error *pe = NULL;
@@ -1758,13 +1758,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
err = create_filter_start(filter_str, true, &pe, filterp);
if (!err) {
- err = process_system_preds(dir, tr, pe, filter_str);
+ err = process_system_preds(dir, dir->tr, pe, filter_str);
if (!err) {
/* System filters just show a default message */
kfree((*filterp)->filter_string);
(*filterp)->filter_string = NULL;
} else {
- append_filter_err(tr, pe, *filterp);
+ append_filter_err(dir->tr, pe, *filterp);
}
}
create_filter_finish(pe);
@@ -1852,7 +1852,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
goto out_unlock;
}
- err = create_system_filter(dir, tr, filter_string, &filter);
+ err = create_system_filter(dir, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 39ebe1826fc3..c1abd63f1d6c 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -81,6 +81,7 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event);
@@ -153,6 +154,7 @@ struct hist_field {
static u64 hist_field_none(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field,
static u64 hist_field_counter(struct hist_field *field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field,
static u64 hist_field_string(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field,
static u64 hist_field_dynstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
static u64 hist_field_pstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field,
static u64 hist_field_log2(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, elt, rbe, event);
+ u64 val = operand->fn(operand, elt, buffer, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
static u64 hist_field_plus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
return val1 + val2;
}
static u64 hist_field_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand1 = hist_field->operands[0];
struct hist_field *operand2 = hist_field->operands[1];
- u64 val1 = operand1->fn(operand1, elt, rbe, event);
- u64 val2 = operand2->fn(operand2, elt, rbe, event);
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
return val1 - val2;
}
static u64 hist_field_unary_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_field *operand = hist_field->operands[0];
- s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+ s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
u64 val = (u64)-sval;
return val;
@@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
#define DEFINE_HIST_FIELD_FN(type) \
static u64 hist_field_##type(struct hist_field *hist_field, \
struct tracing_map_elt *elt, \
+ struct trace_buffer *buffer, \
struct ring_buffer_event *rbe, \
void *event) \
{ \
@@ -380,7 +391,8 @@ struct hist_trigger_data {
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals);
@@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
}
static void action_trace(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -624,13 +637,14 @@ struct hist_var_data {
static u64 hist_field_timestamp(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
struct hist_trigger_data *hist_data = hist_field->hist_data;
struct trace_array *tr = hist_data->event_file->tr;
- u64 ts = ring_buffer_event_time_stamp(rbe);
+ u64 ts = ring_buffer_event_time_stamp(buffer, rbe);
if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
ts = ns2usecs(ts);
@@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
static u64 hist_field_cpu(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
static u64 hist_field_var_ref(struct hist_field *hist_field,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *event)
{
@@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data,
}
static inline void __update_field_vars(struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec,
struct field_var **field_vars,
@@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
struct hist_field *var = field_var->var;
struct hist_field *val = field_var->val;
- var_val = val->fn(val, elt, rbe, rec);
+ var_val = val->fn(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
if (val->flags & HIST_FIELD_FL_STRING) {
@@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
static void update_field_vars(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
struct ring_buffer_event *rbe,
void *rec)
{
- __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars,
hist_data->n_field_vars, 0);
}
static void save_track_data_vars(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
- __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+ __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars,
hist_data->n_save_vars, hist_data->n_field_var_str);
}
@@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data,
}
static void save_track_data(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
if (data->track_data.save_data)
- data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->track_data.save_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
static bool check_track_val(struct tracing_map_elt *elt,
@@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals)
@@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
return false;
}
static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data,
u64 *var_ref_vals) {}
@@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m,
}
static void ontrack_action(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
@@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data,
if (check_track_val(elt, data, var_val)) {
save_track_val(hist_data, elt, data, var_val);
- save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ save_track_data(hist_data, elt, buffer, rec, rbe,
+ key, data, var_ref_vals);
}
}
@@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe,
u64 *var_ref_vals)
{
@@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
@@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
for_each_hist_key_field(i, hist_data) {
hist_field = hist_data->fields[i];
if (hist_field->flags & HIST_FIELD_FL_VAR) {
- hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
var_idx = hist_field->var.idx;
tracing_map_set_var(elt, var_idx, hist_val);
}
}
- update_field_vars(hist_data, elt, rbe, rec);
+ update_field_vars(hist_data, elt, buffer, rbe, rec);
}
static inline void add_to_key(char *compound_key, void *key,
@@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key,
static void
hist_trigger_actions(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe, void *key,
u64 *var_ref_vals)
{
@@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->n_actions; i++) {
data = hist_data->actions[i];
- data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals);
}
}
-static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+static void event_hist_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
@@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
HIST_STACKTRACE_SKIP);
key = entries;
} else {
- field_contents = key_field->fn(key_field, elt, rbe, rec);
+ field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
if (!elt)
return;
- hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5456,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
- tracing_set_time_stamp_abs(file->tr, true);
+ tracing_set_filter_buffering(file->tr, true);
}
if (named_data)
@@ -5564,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (hist_data->enable_timestamps) {
if (!hist_data->remove || unregistered)
- tracing_set_time_stamp_abs(file->tr, false);
+ tracing_set_filter_buffering(file->tr, false);
}
}
@@ -5611,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file)
update_cond_flag(file);
if (hist_data->enable_timestamps)
- tracing_set_time_stamp_abs(file->tr, false);
+ tracing_set_filter_buffering(file->tr, false);
if (test->ops->free)
test->ops->free(test->ops, test);
}
@@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void)
}
static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
}
static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- hist_enable_trigger(data, rec, event);
+ hist_enable_trigger(data, buffer, rec, event);
}
static struct event_trigger_ops hist_enable_trigger_ops = {
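Most of the hist-trigger churn above is mechanical: every field and action callback grows a struct trace_buffer * parameter so the timestamp helper can be called as ring_buffer_event_time_stamp(buffer, rbe). A tiny userspace sketch of the pattern follows; buffer, rb_event and field_fn are stand-in types chosen for illustration, not the kernel ones.

#include <stdint.h>
#include <stdio.h>

struct buffer   { uint64_t base_ts; };	/* stand-in for struct trace_buffer */
struct rb_event { uint64_t delta; };	/* stand-in for ring_buffer_event   */

/* every callback now takes the buffer as well, mirroring hist_field_fn_t */
typedef uint64_t (*field_fn)(struct buffer *buf, struct rb_event *ev);

static uint64_t field_timestamp(struct buffer *buf, struct rb_event *ev)
{
	/* stands in for ring_buffer_event_time_stamp(buffer, rbe) */
	return buf->base_ts + ev->delta;
}

int main(void)
{
	struct buffer buf = { .base_ts = 1000 };
	struct rb_event ev = { .delta = 25 };
	field_fn fn = field_timestamp;

	printf("ts=%llu\n", (unsigned long long)fn(&buf, &ev));
	return 0;
}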
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 8d71e6c83f10..2ac75eb6aa86 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se)
/**
* synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
+ * @event_name: The name of the new synthetic event
*
* Delete a synthetic event that was created with synth_event_create().
*
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f725802160c0..b8bfa8505b7b 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct event_trigger_data *data;
@@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
if (data->paused)
continue;
if (!rec) {
- data->ops->func(data, rec, event);
+ data->ops->func(data, buffer, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data, rec, event);
+ data->ops->func(data, buffer, rec, event);
}
return tt;
}
@@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, NULL, NULL);
+ data->ops->func(data, NULL, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data)
}
static void
-traceon_trigger(struct event_trigger_data *data, void *rec,
+traceon_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (tracing_is_on())
@@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec,
}
static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec,
+traceon_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (tracing_is_on())
@@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec,
}
static void
-traceoff_trigger(struct event_trigger_data *data, void *rec,
+traceoff_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!tracing_is_on())
@@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec,
}
static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+traceoff_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!tracing_is_on())
@@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = {
#ifdef CONFIG_TRACER_SNAPSHOT
static void
-snapshot_trigger(struct event_trigger_data *data, void *rec,
+snapshot_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct trace_event_file *file = data->private_data;
@@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec,
}
static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+snapshot_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- snapshot_trigger(data, rec, event);
+ snapshot_trigger(data, buffer, rec, event);
}
static int
@@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif
static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
trace_dump_stack(STACK_SKIP);
}
static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
if (!data->count)
@@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- stacktrace_trigger(data, rec, event);
+ stacktrace_trigger(data, buffer, rec, event);
}
static int
@@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
}
static void
-event_enable_trigger(struct event_trigger_data *data, void *rec,
+event_enable_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec,
}
static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+event_enable_count_trigger(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, rec, event);
+ event_enable_trigger(data, buffer, rec, event);
}
int event_enable_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f93723ca66bc..1f0e63f5d1f9 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -27,13 +27,28 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs);
static struct tracer_flags func_flags;
/* Our option */
enum {
- TRACE_FUNC_OPT_STACK = 0x1,
+
+ TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */
+ TRACE_FUNC_OPT_STACK = 0x1,
+ TRACE_FUNC_OPT_NO_REPEATS = 0x2,
+
+ /* Update this to next highest bit. */
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x4
};
+#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
+
int ftrace_allocate_ftrace_ops(struct trace_array *tr)
{
struct ftrace_ops *ops;
@@ -86,6 +101,34 @@ void ftrace_destroy_function_files(struct trace_array *tr)
ftrace_free_ftrace_ops(tr);
}
+static ftrace_func_t select_trace_function(u32 flags_val)
+{
+ switch (flags_val & TRACE_FUNC_OPT_MASK) {
+ case TRACE_FUNC_NO_OPTS:
+ return function_trace_call;
+ case TRACE_FUNC_OPT_STACK:
+ return function_stack_trace_call;
+ case TRACE_FUNC_OPT_NO_REPEATS:
+ return function_no_repeats_trace_call;
+ case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS:
+ return function_stack_no_repeats_trace_call;
+ default:
+ return NULL;
+ }
+}
+
+static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
+{
+ if (!tr->last_func_repeats &&
+ (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) {
+ tr->last_func_repeats = alloc_percpu(struct trace_func_repeats);
+ if (!tr->last_func_repeats)
+ return false;
+ }
+
+ return true;
+}
+
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
@@ -97,12 +140,12 @@ static int function_trace_init(struct trace_array *tr)
if (!tr->ops)
return -ENOMEM;
- /* Currently only the global instance can do stack tracing */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
- func_flags.val & TRACE_FUNC_OPT_STACK)
- func = function_stack_trace_call;
- else
- func = function_trace_call;
+ func = select_trace_function(func_flags.val);
+ if (!func)
+ return -EINVAL;
+
+ if (!handle_func_repeats(tr, func_flags.val))
+ return -ENOMEM;
ftrace_init_array_ops(tr, func);
@@ -205,15 +248,137 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
local_irq_restore(flags);
}
+static inline bool is_repeat_check(struct trace_array *tr,
+ struct trace_func_repeats *last_info,
+ unsigned long ip, unsigned long parent_ip)
+{
+ if (last_info->ip == ip &&
+ last_info->parent_ip == parent_ip &&
+ last_info->count < U16_MAX) {
+ last_info->ts_last_call =
+ ring_buffer_time_stamp(tr->array_buffer.buffer);
+ last_info->count++;
+ return true;
+ }
+
+ return false;
+}
+
+static inline void process_repeats(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ struct trace_func_repeats *last_info,
+ unsigned int trace_ctx)
+{
+ if (last_info->count) {
+ trace_last_func_repeats(tr, last_info, trace_ctx);
+ last_info->count = 0;
+ }
+
+ last_info->ip = ip;
+ last_info->parent_ip = parent_ip;
+}
+
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ struct trace_func_repeats *last_info;
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned int trace_ctx;
+ unsigned long flags;
+ int bit;
+ int cpu;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ preempt_disable_notrace();
+
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ if (atomic_read(&data->disabled))
+ goto out;
+
+ /*
+ * An interrupt may happen at any place here. But as far as I can see,
+ * the only damage that this can cause is to mess up the repetition
+ * counter without valuable data being lost.
+ * TODO: think about a solution that is better than just hoping to be
+ * lucky.
+ */
+ last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ if (is_repeat_check(tr, last_info, ip, parent_ip))
+ goto out;
+
+ local_save_flags(flags);
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
+
+out:
+ ftrace_test_recursion_unlock(bit);
+ preempt_enable_notrace();
+}
+
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ struct trace_func_repeats *last_info;
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ unsigned int trace_ctx;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ if (is_repeat_check(tr, last_info, ip, parent_ip))
+ goto out;
+
+ trace_ctx = tracing_gen_ctx_flags(flags);
+ process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+ trace_function(tr, ip, parent_ip, trace_ctx);
+ __trace_stack(tr, trace_ctx, STACK_SKIP);
+ }
+
+ out:
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
+ { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
{ } /* Always set a last empty entry */
};
static struct tracer_flags func_flags = {
- .val = 0, /* By default: all flags disabled */
+ .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */
.opts = func_opts
};
@@ -235,30 +400,32 @@ static struct tracer function_trace;
static int
func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
- switch (bit) {
- case TRACE_FUNC_OPT_STACK:
- /* do nothing if already set */
- if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
- break;
-
- /* We can change this flag when not running. */
- if (tr->current_trace != &function_trace)
- break;
+ ftrace_func_t func;
+ u32 new_flags;
- unregister_ftrace_function(tr->ops);
+ /* Do nothing if already set. */
+ if (!!set == !!(func_flags.val & bit))
+ return 0;
- if (set) {
- tr->ops->func = function_stack_trace_call;
- register_ftrace_function(tr->ops);
- } else {
- tr->ops->func = function_trace_call;
- register_ftrace_function(tr->ops);
- }
+ /* We can change this flag only when not running. */
+ if (tr->current_trace != &function_trace)
+ return 0;
- break;
- default:
+ new_flags = (func_flags.val & ~bit) | (set ? bit : 0);
+ func = select_trace_function(new_flags);
+ if (!func)
return -EINVAL;
- }
+
+ /* Check if there's anything to change. */
+ if (tr->ops->func == func)
+ return 0;
+
+ if (!handle_func_repeats(tr, new_flags))
+ return -ENOMEM;
+
+ unregister_ftrace_function(tr->ops);
+ tr->ops->func = func;
+ register_ftrace_function(tr->ops);
return 0;
}
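select_trace_function() above maps the masked option bits to one of four handlers and returns NULL for unsupported combinations, which func_set_flag() turns into -EINVAL. A compact userspace sketch of that dispatch follows; the OPT_* flags and handler names are stand-ins.

#include <stdint.h>
#include <stdio.h>

#define OPT_NONE	0x0
#define OPT_STACK	0x1
#define OPT_NO_REPEATS	0x2
#define OPT_MASK	(OPT_STACK | OPT_NO_REPEATS)

static void plain(void)            { puts("plain function trace"); }
static void stack(void)            { puts("trace + stack"); }
static void no_repeats(void)       { puts("trace, repeats folded"); }
static void stack_no_repeats(void) { puts("trace + stack, repeats folded"); }

static void (*select_handler(uint32_t flags))(void)
{
	switch (flags & OPT_MASK) {
	case OPT_NONE:			return plain;
	case OPT_STACK:			return stack;
	case OPT_NO_REPEATS:		return no_repeats;
	case OPT_STACK | OPT_NO_REPEATS: return stack_no_repeats;
	default:			return NULL;	/* not reached after masking; kept to mirror the patch */
	}
}

int main(void)
{
	void (*fn)(void) = select_handler(OPT_STACK | OPT_NO_REPEATS);

	if (!fn)	/* analogous to func_set_flag() returning -EINVAL */
		return 1;
	fn();
	return 0;
}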
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0aa6e6faa943..0de6837722da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
* - we are inside irq code
* - we just entered irq code
*
- * retunns 0 if
+ * returns 0 if
* - funcgraph-interrupts option is set
* - we are not inside irq code
*/
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 34dc1a712dcb..632ef88131a9 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -83,7 +83,7 @@ struct hwlat_sample {
u64 nmi_total_ts; /* Total time spent in NMIs */
struct timespec64 timestamp; /* wall time */
int nmi_count; /* # NMIs during this sample */
- int count; /* # of iteratons over threash */
+ int count; /* # of iterations over thresh */
};
/* keep the global state somewhere. */
@@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr)
}
/**
- * stop_kthread - Inform the hardware latency samping/detector kthread to stop
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
*
* This kicks the running hardware latency sampling/detector kernel thread and
* tells it to stop sampling now. Use this on unload and at system shutdown.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6fe770d86dc3..ea6178cb5e33 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
kretprobe_perf_func(tk, ri, regs);
#endif
- return 0; /* We don't tweek kernel, so just return 0 */
+ return 0; /* We don't tweak kernel, so just return 0 */
}
NOKPROBE_SYMBOL(kretprobe_dispatcher);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 61255bad7e01..d0368a569bfa 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
- trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+ trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
va_end(ap);
}
EXPORT_SYMBOL(trace_event_printf);
@@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
return !trace_seq_has_overflowed(s);
}
+static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter,
+ unsigned long long ts)
+{
+ unsigned long secs, usec_rem;
+ unsigned long long t;
+
+ if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
+ t = ns2usecs(ts);
+ usec_rem = do_div(t, USEC_PER_SEC);
+ secs = (unsigned long)t;
+ trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem);
+ } else
+ trace_seq_printf(s, " %12llu", ts);
+}
+
int trace_print_context(struct trace_iterator *iter)
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
- unsigned long long t;
- unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
trace_find_cmdline(entry->pid, comm);
@@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter)
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
- if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
- t = ns2usecs(iter->ts);
- usec_rem = do_div(t, USEC_PER_SEC);
- secs = (unsigned long)t;
- trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
- } else
- trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_print_time(s, iter, iter->ts);
+ trace_seq_puts(s, ": ");
return !trace_seq_has_overflowed(s);
}
@@ -837,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
return trace_handle_return(&iter->seq);
}
+static void print_fn_trace(struct trace_seq *s, unsigned long ip,
+ unsigned long parent_ip, int flags)
+{
+ seq_print_ip_sym(s, ip, flags);
+
+ if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, parent_ip, flags);
+ }
+}
+
/* TRACE_FN */
static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -846,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
-
- if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- trace_seq_puts(s, " <-");
- seq_print_ip_sym(s, field->parent_ip, flags);
- }
-
+ print_fn_trace(s, field->ip, field->parent_ip, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1373,6 +1386,51 @@ static struct trace_event trace_raw_data_event = {
.funcs = &trace_raw_data_funcs,
};
+static enum print_line_t
+trace_func_repeats_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct func_repeats_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, "%lu %lu %u %llu\n",
+ field->ip,
+ field->parent_ip,
+ field->count,
+ FUNC_REPEATS_GET_DELTA_TS(field));
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_func_repeats_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct func_repeats_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ print_fn_trace(s, field->ip, field->parent_ip, flags);
+ trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
+ trace_print_time(s, iter,
+ iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
+ trace_seq_puts(s, ")\n");
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_func_repeats_funcs = {
+ .trace = trace_func_repeats_print,
+ .raw = trace_func_repeats_raw,
+};
+
+static struct trace_event trace_func_repeats_event = {
+ .type = TRACE_FUNC_REPEATS,
+ .funcs = &trace_func_repeats_funcs,
+};
static struct trace_event *events[] __initdata = {
&trace_fn_event,
@@ -1385,6 +1443,7 @@ static struct trace_event *events[] __initdata = {
&trace_print_event,
&trace_hwlat_event,
&trace_raw_data_event,
+ &trace_func_repeats_event,
NULL
};
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index ff32476df072..4b320fe7df70 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
}
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+bool trace_is_tracepoint_string(const char *str)
+{
+ const char **ptr;
+
+ for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) {
+ if (str == *ptr)
+ return true;
+ }
+ return false;
+}
+
static const char **find_next(void *v, loff_t *pos)
{
const char **fmt = v;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ec589a4612df..15413ad7cef2 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type)
if (!trace_probe_log.argv)
return;
- /* Recalcurate the length and allocate buffer */
+ /* Recalculate the length and allocate buffer */
for (i = 0; i < trace_probe_log.argc; i++) {
if (i == trace_probe_log.index)
pos = len;
@@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type)
/**
* Set the error position is next to the last arg + space.
* Note that len includes the terminal null and the cursor
- * appaers at pos + 1.
+ * appears at pos + 1.
*/
pos = len;
offset = 0;
@@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
}
/*
- * Since $comm and immediate string can not be dereferred,
+ * Since $comm and immediate string can not be dereferenced,
* we can find those by strcmp.
*/
if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 7ce4027089ee..227d518e5ba5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -134,7 +134,7 @@ struct fetch_type {
size_t size; /* Byte size of type */
int is_signed; /* Signed flag */
print_type_func_t print; /* Print functions */
- const char *fmt; /* Fromat string */
+ const char *fmt; /* Format string */
const char *fmttype; /* Name in format file */
};
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index e5282828f4a6..f003c5d02a3a 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -167,7 +167,7 @@ array:
return code->op == FETCH_OP_END ? ret : -EILSEQ;
}
-/* Sum up total data length for dynamic arraies (strings) */
+/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 73ef12092250..adf7ef194005 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
int ret;
/*
- * Now that the big kernel lock is no longer preemptable,
+ * Now that the big kernel lock is no longer preemptible,
* and this is called with the BKL held, it will always
* fail. If preemption is already disabled, simply
* pass the test. When the BKL is removed, or becomes
@@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
int ret;
/*
- * Now that the big kernel lock is no longer preemptable,
+ * Now that the big kernel lock is no longer preemptible,
* and this is called with the BKL held, it will always
* fail. If preemption is already disabled, simply
* pass the test. When the BKL is removed, or becomes
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1d84fcc78e3e..9c90b3a7dce2 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -16,7 +16,7 @@
* The buffer size is currently PAGE_SIZE, although it may become dynamic
* in the future.
*
- * A write to the buffer will either succed or fail. That is, unlike
+ * A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
* try to write something into the trace_seq buffer and if it fails
@@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
* @fmt: printf format string
*
* The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
* trace_seq_printf() is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
@@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* @fmt: printf format string
*
* The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
* trace_seq_printf is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
@@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
* @c: simple character to record
*
* The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple charater
+ * copy to user routines. This function records a simple character
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
*/
@@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path)
EXPORT_SYMBOL_GPL(trace_seq_path);
/**
- * trace_seq_to_user - copy the squence buffer to user space
+ * trace_seq_to_user - copy the sequence buffer to user space
* @s: trace sequence descriptor
* @ubuf: The userspace memory location to copy to
* @cnt: The amount to copy
@@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*
* On failure it returns -EBUSY if all of the content in the
* sequence has been already read, which includes nothing in the
- * sequenc (@s->len == @s->readpos).
+ * sequence (@s->len == @s->readpos).
*
* Returns -EFAULT if the copy to userspace fails.
*/
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..8d8874f1c35e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -74,6 +74,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+#ifdef CONFIG_FANOTIFY
+ UCOUNT_ENTRY("max_fanotify_groups"),
+ UCOUNT_ENTRY("max_fanotify_marks"),
+#endif
{ }
};
#endif /* CONFIG_SYSCTL */
diff --git a/kernel/umh.c b/kernel/umh.c
index 3f646613a9d3..36c123360ab8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
+#include <linux/initrd.h>
#include <trace/events/module.h>
@@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
+ wait_for_initramfs();
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
@@ -336,8 +338,8 @@ static void helper_unlock(void)
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
* @init: an init function
+ * @cleanup: a cleanup function
* @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
@@ -348,7 +350,7 @@ static void helper_unlock(void)
* exec. A non-zero return code causes the process to error out, exit,
* and return the failure to the calling process
*
- * The cleanup function is just before ethe subprocess_info is about to
+ * The cleanup function is called just before the subprocess_info is about to
* be freed. This can be used for freeing the argv and envp. The
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
@@ -384,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
+ * @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
diff --git a/kernel/up.c b/kernel/up.c
index c6f323dcd45b..a38b8b095251 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
unsigned long flags;
@@ -36,38 +36,9 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
}
EXPORT_SYMBOL(smp_call_function_single_async);
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/*
- * Note we still need to test the mask even for UP
- * because we actually can get an empty mask from
- * code that on SMP might call us without the local
- * CPU in the mask.
- */
-void on_each_cpu_mask(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
-{
- unsigned long flags;
-
- if (cpumask_test_cpu(0, mask)) {
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- }
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
/*
* Preemption is disabled here to make sure the cond_func is called under the
- * same condtions in UP and SMP.
+ * same conditions in UP and SMP.
*/
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
@@ -75,7 +46,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
unsigned long flags;
preempt_disable();
- if (cond_func(0, info)) {
+ if ((!cond_func || cond_func(0, info)) && cpumask_test_cpu(0, mask)) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
@@ -84,13 +55,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait)
-{
- on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
int ret;
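The up.c hunk above aligns the uniprocessor on_each_cpu_cond_mask() with the SMP semantics: a NULL cond_func means "always call", and CPU 0 must actually be present in the mask. A small userspace sketch of just that check, using a plain bitmask in place of struct cpumask (all names below are illustrative):

#include <stdbool.h>
#include <stdio.h>

typedef bool (*cond_fn)(int cpu, void *info);
typedef void (*work_fn)(void *info);

/* UP version: only CPU 0 exists, so the work runs at most once */
static void run_on_cpu0(cond_fn cond, work_fn work, void *info,
			unsigned long mask)
{
	if ((!cond || cond(0, info)) && (mask & 1UL))
		work(info);
}

static void hello(void *info)
{
	printf("ran: %s\n", (const char *)info);
}

int main(void)
{
	run_on_cpu0(NULL, hello, "no condition, cpu0 in mask", 0x1);	/* runs  */
	run_on_cpu0(NULL, hello, "cpu0 not in mask", 0x2);		/* skips */
	return 0;
}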
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9a4b980d695b..8d62863721b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -85,7 +85,7 @@ int create_user_ns(struct cred *new)
/*
* Verify that we can not violate the policy of which files
* may be accessed that is specified by the root directory,
- * by verifing that the root directory is at the root of the
+ * by verifying that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
ret = -EPERM;
@@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
goto out;
ret = -EINVAL;
}
- /* Be very certaint the new map actually exists */
+ /* Be very certain the new map actually exists */
if (new_map.nr_extents == 0)
goto out;
@@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
- * And the opener of the id file also had the approprpiate capability.
+ * And the opener of the id file also has the appropriate capability.
*/
if (ns_capable(ns->parent, cap_setid) &&
file_ns_capable(file, ns->parent, cap_setid))
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 107bc38b1945..7c397907d0e9 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,11 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-#define SOFTLOCKUP_RESET ULONG_MAX
+/*
+ * Delay the softlockup report when running known slow code.
+ * It does _not_ affect the timestamp of the last successful reschedule.
+ */
+#define SOFTLOCKUP_DELAY_REPORT ULONG_MAX
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
@@ -169,10 +173,12 @@ unsigned int __read_mostly softlockup_panic =
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
+/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+/* Timestamp of the last softlockup report. */
+static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
@@ -235,10 +241,16 @@ static void set_sample_period(void)
watchdog_update_hrtimer_threshold(sample_period);
}
+static void update_report_ts(void)
+{
+ __this_cpu_write(watchdog_report_ts, get_timestamp());
+}
+
/* Commands for resetting the watchdog */
-static void __touch_watchdog(void)
+static void update_touch_ts(void)
{
__this_cpu_write(watchdog_touch_ts, get_timestamp());
+ update_report_ts();
}
/**
@@ -252,10 +264,10 @@ static void __touch_watchdog(void)
notrace void touch_softlockup_watchdog_sched(void)
{
/*
- * Preemption can be enabled. It doesn't matter which CPU's timestamp
- * gets zeroed here, so use the raw_ operation.
+ * Preemption can be enabled. It doesn't matter which CPU's watchdog
+ * report period gets restarted here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+ raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
notrace void touch_softlockup_watchdog(void)
@@ -279,7 +291,7 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask) {
- per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
+ per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
wq_watchdog_touch(cpu);
}
}
@@ -287,16 +299,16 @@ void touch_all_softlockup_watchdogs(void)
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+ __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}
-static int is_softlockup(unsigned long touch_ts)
+static int is_softlockup(unsigned long touch_ts, unsigned long period_ts)
{
unsigned long now = get_timestamp();
if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
/* Warn about unreasonable delays. */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
+ if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
}
return 0;
@@ -332,7 +344,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
- __touch_watchdog();
+ update_touch_ts();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -342,6 +354,7 @@ static int softlockup_fn(void *data)
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+ unsigned long period_ts = __this_cpu_read(watchdog_report_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
@@ -363,7 +376,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == SOFTLOCKUP_RESET) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a soft lockup. Check to see if the host
+ * stopped the vm before we process the timestamps.
+ */
+ kvm_check_and_clear_guest_paused();
+
+ /* Reset the interval when touched by known problematic code. */
+ if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -373,9 +394,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
sched_clock_tick();
}
- /* Clear the guest paused flag on watchdog reset */
- kvm_check_and_clear_guest_paused();
- __touch_watchdog();
+ update_report_ts();
return HRTIMER_RESTART;
}
@@ -385,31 +404,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
* indicate it is getting cpu time. If it hasn't then
* this is a good indication some task is hogging the cpu
*/
- duration = is_softlockup(touch_ts);
+ duration = is_softlockup(touch_ts, period_ts);
if (unlikely(duration)) {
/*
- * If a virtual machine is stopped by the host it can look to
- * the watchdog like a soft lockup, check to see if the host
- * stopped the vm before we issue the warning
+ * Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping all cpu back traces.
*/
- if (kvm_check_and_clear_guest_paused())
- return HRTIMER_RESTART;
-
- /* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
- return HRTIMER_RESTART;
-
if (softlockup_all_cpu_backtrace) {
- /* Prevent multiple soft-lockup reports if one cpu is already
- * engaged in dumping cpu back traces
- */
- if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
- /* Someone else will report us. Let's give up */
- __this_cpu_write(soft_watchdog_warn, true);
+ if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
return HRTIMER_RESTART;
- }
}
+ /* Start period for the next softlockup warning. */
+ update_report_ts();
+
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -421,22 +429,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
dump_stack();
if (softlockup_all_cpu_backtrace) {
- /* Avoid generating two back traces for current
- * given that one is already made above
- */
trigger_allbutself_cpu_backtrace();
-
- clear_bit(0, &soft_lockup_nmi_warn);
- /* Barrier to sync with other cpus */
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
- __this_cpu_write(soft_watchdog_warn, true);
- } else
- __this_cpu_write(soft_watchdog_warn, false);
+ }
return HRTIMER_RESTART;
}
@@ -461,7 +461,7 @@ static void watchdog_enable(unsigned int cpu)
HRTIMER_MODE_REL_PINNED_HARD);
/* Initialize timestamp */
- __touch_watchdog();
+ update_touch_ts();
/* Enable the perf event */
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
watchdog_nmi_enable(cpu);
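The watchdog rework above keeps two per-CPU timestamps: watchdog_touch_ts records the last real reschedule and is only advanced by softlockup_fn(), while watchdog_report_ts restarts whenever a report is emitted or known-slow code asks for a delay. Repeated reports can therefore fire, yet the printed duration still covers the whole stall. A userspace sketch of the check follows, with made-up numbers and the DELAY_REPORT test folded in (the kernel performs that test before calling is_softlockup()).

#include <stdio.h>

#define THRESH		10UL		/* stand-in for get_softlockup_thresh() */
#define DELAY_REPORT	(~0UL)		/* stand-in for SOFTLOCKUP_DELAY_REPORT */

static unsigned long softlockup_duration(unsigned long now,
					 unsigned long touch_ts,
					 unsigned long report_ts)
{
	if (report_ts == DELAY_REPORT)	/* report explicitly delayed */
		return 0;
	if (now > report_ts + THRESH)	/* overdue since the last report... */
		return now - touch_ts;	/* ...but blame the full stall */
	return 0;
}

int main(void)
{
	printf("%lu\n", softlockup_duration(100, 50, 85));		/* 50: stall reported */
	printf("%lu\n", softlockup_duration(100, 50, 95));		/* 0: within period   */
	printf("%lu\n", softlockup_duration(100, 50, DELAY_REPORT));	/* 0: delayed report  */
	return 0;
}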