summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt2
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c7
-rw-r--r--kernel/audit_tree.c19
-rw-r--r--kernel/audit_watch.c41
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c14
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c28
-rw-r--r--kernel/bpf/cgroup.c162
-rw-r--r--kernel/bpf/core.c79
-rw-r--r--kernel/bpf/cpumap.c3
-rw-r--r--kernel/bpf/devmap.c2
-rw-r--r--kernel/bpf/hashtab.c26
-rw-r--r--kernel/bpf/helpers.c20
-rw-r--r--kernel/bpf/inode.c11
-rw-r--r--kernel/bpf/local_storage.c379
-rw-r--r--kernel/bpf/lpm_trie.c12
-rw-r--r--kernel/bpf/map_in_map.c3
-rw-r--r--kernel/bpf/offload.c223
-rw-r--r--kernel/bpf/reuseport_array.c363
-rw-r--r--kernel/bpf/sockmap.c125
-rw-r--r--kernel/bpf/stackmap.c1
-rw-r--r--kernel/bpf/syscall.c103
-rw-r--r--kernel/bpf/verifier.c76
-rw-r--r--kernel/bpf/xskmap.c4
-rw-r--r--kernel/cgroup/cgroup-internal.h26
-rw-r--r--kernel/cgroup/cgroup-v1.c4
-rw-r--r--kernel/cgroup/cgroup.c16
-rw-r--r--kernel/cpu.c288
-rw-r--r--kernel/crash_core.c8
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/direct.c9
-rw-r--r--kernel/dma/noncoherent.c8
-rw-r--r--kernel/dma/swiotlb.c18
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/events/uprobes.c78
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c89
-rw-r--r--kernel/freezer.c4
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/hung_task.c15
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/livepatch/core.c6
-rw-r--r--kernel/livepatch/transition.c7
-rw-r--r--kernel/locking/lockdep.c35
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c345
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/memremap.c13
-rw-r--r--kernel/module-internal.h25
-rw-r--r--kernel/module.c177
-rw-r--r--kernel/module_signing.c12
-rw-r--r--kernel/pid.c42
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c12
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/internal.h9
-rw-r--r--kernel/printk/printk.c201
-rw-r--r--kernel/printk/printk_safe.c58
-rw-r--r--kernel/reboot.c6
-rw-r--r--kernel/sched/core.c34
-rw-r--r--kernel/sched/fair.c1
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/signal.c268
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sys.c95
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/time/itimer.c5
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/time/posix-timers.c34
-rw-r--r--kernel/trace/Kconfig78
-rw-r--r--kernel/trace/Makefile13
-rw-r--r--kernel/trace/blktrace.c24
-rw-r--r--kernel/trace/bpf_trace.c5
-rw-r--r--kernel/trace/ftrace.c44
-rw-r--r--kernel/trace/preemptirq_delay_test.c72
-rw-r--r--kernel/trace/ring_buffer.c5
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c11
-rw-r--r--kernel/trace/trace.h23
-rw-r--r--kernel/trace/trace_benchmark.h2
-rw-r--r--kernel/trace/trace_clock.c1
-rw-r--r--kernel/trace/trace_entries.h2
-rw-r--r--kernel/trace/trace_event_perf.c1
-rw-r--r--kernel/trace/trace_events.c13
-rw-r--r--kernel/trace/trace_events_filter.c30
-rw-r--r--kernel/trace/trace_events_filter_test.h2
-rw-r--r--kernel/trace/trace_events_hist.c13
-rw-r--r--kernel/trace/trace_events_trigger.c27
-rw-r--r--kernel/trace/trace_hwlat.c7
-rw-r--r--kernel/trace/trace_irqsoff.c270
-rw-r--r--kernel/trace/trace_kprobe.c123
-rw-r--r--kernel/trace/trace_kprobe_selftest.c10
-rw-r--r--kernel/trace/trace_kprobe_selftest.h7
-rw-r--r--kernel/trace/trace_output.c1
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_preemptirq.c89
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_probe.c14
-rw-r--r--kernel/trace/trace_probe.h14
-rw-r--r--kernel/trace/trace_seq.c1
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_uprobe.c16
-rw-r--r--kernel/trace/tracing_map.c11
-rw-r--r--kernel/trace/tracing_map.h2
-rw-r--r--kernel/tracepoint.c97
-rw-r--r--kernel/user.c11
-rw-r--r--kernel/user_namespace.c24
-rw-r--r--kernel/utsname_sysctl.c41
-rw-r--r--kernel/workqueue.c45
116 files changed, 3418 insertions, 1525 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c97419f02..cd1655122ec0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -18,6 +18,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
+ depends on !ARCH_NO_PREEMPT
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -35,6 +36,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
+ depends on !ARCH_NO_PREEMPT
select PREEMPT_COUNT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
diff --git a/kernel/Makefile b/kernel/Makefile
index 04bc07c2b42a..7a63d567fdb5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -123,7 +123,7 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
diff --git a/kernel/audit.c b/kernel/audit.c
index e7478cb58079..2a8058764aa6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -83,9 +83,6 @@
#define AUDIT_INITIALIZED 1
static int audit_initialized;
-#define AUDIT_OFF 0
-#define AUDIT_ON 1
-#define AUDIT_LOCKED 2
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
@@ -1724,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- *t = current_kernel_time64();
+ ktime_get_coarse_real_ts64(t);
*serial = audit_serial();
}
}
@@ -1754,7 +1751,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
- if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
+ if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
return NULL;
/* NOTE: don't ever fail/sleep on these two conditions:
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index c99ebaae5abc..ea43181cde4a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -168,7 +168,8 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
/* Function to return search key in our hash from inode. */
static unsigned long inode_to_key(const struct inode *inode)
{
- return (unsigned long)inode;
+ /* Use address pointed to by connector->obj as the key */
+ return (unsigned long)&inode->i_fsnotify_marks;
}
/*
@@ -183,7 +184,7 @@ static unsigned long chunk_to_key(struct audit_chunk *chunk)
*/
if (WARN_ON_ONCE(!chunk->mark.connector))
return 0;
- return (unsigned long)chunk->mark.connector->inode;
+ return (unsigned long)chunk->mark.connector->obj;
}
static inline struct list_head *chunk_hash(unsigned long key)
@@ -258,7 +259,7 @@ static void untag_chunk(struct node *p)
spin_lock(&entry->lock);
/*
* mark_mutex protects mark from getting detached and thus also from
- * mark->connector->inode getting NULL.
+ * mark->connector->obj getting NULL.
*/
if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&entry->lock);
@@ -288,8 +289,8 @@ static void untag_chunk(struct node *p)
if (!new)
goto Fallback;
- if (fsnotify_add_inode_mark_locked(&new->mark, entry->connector->inode,
- 1)) {
+ if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj,
+ FSNOTIFY_OBJ_TYPE_INODE, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -423,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_lock(&old_entry->lock);
/*
* mark_mutex protects mark from getting detached and thus also from
- * mark->connector->inode getting NULL.
+ * mark->connector->obj getting NULL.
*/
if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
/* old_entry is being shot, lets just lie */
@@ -434,8 +435,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return -ENOENT;
}
- if (fsnotify_add_inode_mark_locked(chunk_entry,
- old_entry->connector->inode, 1)) {
+ if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
+ FSNOTIFY_OBJ_TYPE_INODE, 1)) {
spin_unlock(&old_entry->lock);
mutex_unlock(&old_entry->group->mark_mutex);
fsnotify_put_mark(chunk_entry);
@@ -497,6 +498,8 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
{
struct audit_buffer *ab;
+ if (!audit_enabled)
+ return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index c17c0c268436..787c7afdf829 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -238,20 +238,21 @@ out:
static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
{
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
- if (unlikely(!ab))
- return;
- audit_log_format(ab, "auid=%u ses=%u op=%s",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current), op);
- audit_log_format(ab, " path=");
- audit_log_untrustedstring(ab, w->path);
- audit_log_key(ab, r->filterkey);
- audit_log_format(ab, " list=%d res=1", r->listnr);
- audit_log_end(ab);
- }
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (!ab)
+ return;
+ audit_log_format(ab, "auid=%u ses=%u op=%s",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ audit_get_sessionid(current), op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
}
/* Update inode info in audit rules based on filesystem event. */
@@ -419,6 +420,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
struct path parent_path;
int h, ret = 0;
+ /*
+ * When we will be calling audit_add_to_parent, krule->watch might have
+ * been updated and watch might have been freed.
+ * So we need to keep a reference of watch.
+ */
+ audit_get_watch(watch);
+
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
@@ -427,8 +435,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- if (ret)
+ if (ret) {
+ audit_put_watch(watch);
return ret;
+ }
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
@@ -446,6 +456,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
*list = &audit_inode_hash[h];
error:
path_put(&parent_path);
+ audit_put_watch(watch);
return ret;
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eaa320148d97..bf309f2592c4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -264,7 +264,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
;
}
@@ -337,7 +337,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
switch(f->type) {
case AUDIT_MSGTYPE:
- if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
@@ -428,8 +428,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_EXE:
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
- if (entry->rule.listnr != AUDIT_FILTER_EXIT)
- return -EINVAL;
break;
}
return 0;
@@ -931,7 +929,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
/* If any of these, don't count towards total */
switch(entry->rule.listnr) {
case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
dont_count = 1;
}
@@ -1013,7 +1011,7 @@ int audit_del_rule(struct audit_entry *entry)
/* If any of these, don't count towards total */
switch(entry->rule.listnr) {
case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
dont_count = 1;
}
@@ -1360,6 +1358,11 @@ int audit_filter(int msgtype, unsigned int listtype)
f->type, f->op, f->lsm_rule, NULL);
}
break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(current, e->rule.exe);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ break;
default:
goto unlock_and_return;
}
@@ -1369,7 +1372,7 @@ int audit_filter(int msgtype, unsigned int listtype)
break;
}
if (result > 0) {
- if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+ if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE)
ret = 0;
break;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 80d672a11088..b2d1f043f17f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -494,20 +494,20 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_gid_comparator(cred->gid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
- result = in_group_p(f->gid);
+ result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
- result = !in_group_p(f->gid);
+ result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_EGID:
result = audit_gid_comparator(cred->egid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
- result = in_egroup_p(f->gid);
+ result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
- result = !in_egroup_p(f->gid);
+ result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_SGID:
@@ -1544,10 +1544,10 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[2] = a3;
context->argv[3] = a4;
context->serial = 0;
- context->ctime = current_kernel_time64();
context->in_syscall = 1;
context->current_state = state;
context->ppid = 0;
+ ktime_get_coarse_real_ts64(&context->ctime);
}
/**
@@ -2466,7 +2466,7 @@ void audit_core_dumps(long signr)
if (signr == SIGQUIT) /* don't care for those */
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND);
if (unlikely(!ab))
return;
audit_log_task(ab);
@@ -2490,7 +2490,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP);
if (unlikely(!ab))
return;
audit_log_task(ab);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..0488b8258321 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,6 +3,7 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
@@ -22,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030c77..0c17aab3ce5f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
}
/* Called from syscall */
-static int array_map_alloc_check(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
@@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
- u32 btf_key_id, u32 btf_value_id)
+static int array_map_check_btf(const struct bpf_map *map,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
{
- const struct btf_type *key_type, *value_type;
- u32 key_size, value_size;
u32 int_data;
- key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
- if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
int_data = *(u32 *)(key_type + 1);
- /* bpf array can only take a u32 key. This check makes
- * sure that the btf matches the attr used during map_create.
+ /* bpf array can only take a u32 key. This check makes sure
+ * that the btf matches the attr used during map_create.
*/
- if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
- BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
- if (!value_type || value_size != map->value_size)
+ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
return -EINVAL;
return 0;
@@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_check_btf = array_map_check_btf,
};
static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = bpf_fd_array_map_clear,
+ .map_check_btf = map_check_no_btf,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
+ .map_check_btf = map_check_no_btf,
};
#ifdef CONFIG_CGROUPS
@@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+ .map_check_btf = map_check_no_btf,
};
#endif
@@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
+ .map_check_btf = map_check_no_btf,
};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 3d83ee7df381..6a7d931bbc55 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp)
list_for_each_entry_safe(pl, tmp, progs, node) {
list_del(&pl->node);
bpf_prog_put(pl->prog);
+ bpf_cgroup_storage_unlink(pl->storage);
+ bpf_cgroup_storage_free(pl->storage);
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
@@ -95,7 +97,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
struct bpf_prog_array __rcu **array)
{
- struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
int cnt = 0;
@@ -115,18 +117,20 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- list_for_each_entry(pl,
- &p->bpf.progs[type], node) {
- if (!pl->prog)
- continue;
- rcu_dereference_protected(progs, 1)->
- progs[cnt++] = pl->prog;
- }
- p = cgroup_parent(p);
- } while (p);
+ if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ list_for_each_entry(pl, &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
- *array = progs;
+ progs->items[cnt].prog = pl->prog;
+ progs->items[cnt].cgroup_storage = pl->storage;
+ cnt++;
+ }
+ } while ((p = cgroup_parent(p)));
+
+ rcu_assign_pointer(*array, progs);
return 0;
}
@@ -173,6 +177,45 @@ cleanup:
return -ENOMEM;
}
+static int update_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type)
+{
+ struct cgroup_subsys_state *css;
+ int err;
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ return err;
+}
+
#define BPF_CGROUP_MAX_PROGS 64
/**
@@ -189,7 +232,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
{
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
- struct cgroup_subsys_state *css;
+ struct bpf_cgroup_storage *storage, *old_storage = NULL;
struct bpf_prog_list *pl;
bool pl_was_allocated;
int err;
@@ -211,72 +254,71 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ storage = bpf_cgroup_storage_alloc(prog);
+ if (IS_ERR(storage))
+ return -ENOMEM;
+
if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node)
- if (pl->prog == prog)
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog) {
/* disallow attaching the same prog twice */
+ bpf_cgroup_storage_free(storage);
return -EINVAL;
+ }
+ }
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl)
+ if (!pl) {
+ bpf_cgroup_storage_free(storage);
return -ENOMEM;
+ }
+
pl_was_allocated = true;
pl->prog = prog;
+ pl->storage = storage;
list_add_tail(&pl->node, progs);
} else {
if (list_empty(progs)) {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl)
+ if (!pl) {
+ bpf_cgroup_storage_free(storage);
return -ENOMEM;
+ }
pl_was_allocated = true;
list_add_tail(&pl->node, progs);
} else {
pl = list_first_entry(progs, typeof(*pl), node);
old_prog = pl->prog;
+ old_storage = pl->storage;
+ bpf_cgroup_storage_unlink(old_storage);
pl_was_allocated = false;
}
pl->prog = prog;
+ pl->storage = storage;
}
cgrp->bpf.flags[type] = flags;
- /* allocate and recompute effective prog arrays */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
- if (err)
- goto cleanup;
- }
-
- /* all allocations were successful. Activate all prog arrays */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- activate_effective_progs(desc, type, desc->bpf.inactive);
- desc->bpf.inactive = NULL;
- }
+ err = update_effective_progs(cgrp, type);
+ if (err)
+ goto cleanup;
static_branch_inc(&cgroup_bpf_enabled_key);
+ if (old_storage)
+ bpf_cgroup_storage_free(old_storage);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_cgroup_storage_link(storage, cgrp, type);
return 0;
cleanup:
- /* oom while computing effective. Free all computed effective arrays
- * since they were not activated
- */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- bpf_prog_array_free(desc->bpf.inactive);
- desc->bpf.inactive = NULL;
- }
-
/* and cleanup the prog list */
pl->prog = old_prog;
+ bpf_cgroup_storage_free(pl->storage);
+ pl->storage = old_storage;
+ bpf_cgroup_storage_link(old_storage, cgrp, type);
if (pl_was_allocated) {
list_del(&pl->node);
kfree(pl);
@@ -299,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct list_head *progs = &cgrp->bpf.progs[type];
u32 flags = cgrp->bpf.flags[type];
struct bpf_prog *old_prog = NULL;
- struct cgroup_subsys_state *css;
struct bpf_prog_list *pl;
int err;
@@ -338,25 +379,14 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
}
- /* allocate and recompute effective prog arrays */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
- if (err)
- goto cleanup;
- }
-
- /* all allocations were successful. Activate all prog arrays */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- activate_effective_progs(desc, type, desc->bpf.inactive);
- desc->bpf.inactive = NULL;
- }
+ err = update_effective_progs(cgrp, type);
+ if (err)
+ goto cleanup;
/* now can actually delete it from this cgroup list */
list_del(&pl->node);
+ bpf_cgroup_storage_unlink(pl->storage);
+ bpf_cgroup_storage_free(pl->storage);
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
@@ -367,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
return 0;
cleanup:
- /* oom while computing effective. Free all computed effective arrays
- * since they were not activated
- */
- css_for_each_descendant_pre(css, &cgrp->self) {
- struct cgroup *desc = container_of(css, struct cgroup, self);
-
- bpf_prog_array_free(desc->bpf.inactive);
- desc->bpf.inactive = NULL;
- }
-
/* and restore back old_prog */
pl->prog = old_prog;
return err;
@@ -655,6 +675,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_delete_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1e5625d46414..3f5bf1af0826 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1538,11 +1538,12 @@ static struct {
.null_prog = NULL,
};
-struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
if (prog_cnt)
return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ sizeof(struct bpf_prog_array_item) *
+ (prog_cnt + 1),
flags);
return &empty_prog_array.hdr;
@@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
kfree_rcu(progs, rcu);
}
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
{
- struct bpf_prog **prog;
+ struct bpf_prog_array_item *item;
u32 cnt = 0;
rcu_read_lock();
- prog = rcu_dereference(progs)->progs;
- for (; *prog; prog++)
- if (*prog != &dummy_bpf_prog.prog)
+ item = rcu_dereference(array)->items;
+ for (; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
cnt++;
rcu_read_unlock();
return cnt;
}
-static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
u32 *prog_ids,
u32 request_cnt)
{
+ struct bpf_prog_array_item *item;
int i = 0;
- for (; *prog; prog++) {
- if (*prog == &dummy_bpf_prog.prog)
+ item = rcu_dereference_check(array, 1)->items;
+ for (; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
continue;
- prog_ids[i] = (*prog)->aux->id;
+ prog_ids[i] = item->prog->aux->id;
if (++i == request_cnt) {
- prog++;
+ item++;
break;
}
}
- return !!(*prog);
+ return !!(item->prog);
}
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
__u32 __user *prog_ids, u32 cnt)
{
- struct bpf_prog **prog;
unsigned long err = 0;
bool nospc;
u32 *ids;
@@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
if (!ids)
return -ENOMEM;
rcu_read_lock();
- prog = rcu_dereference(progs)->progs;
- nospc = bpf_prog_array_copy_core(prog, ids, cnt);
+ nospc = bpf_prog_array_copy_core(array, ids, cnt);
rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
@@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
return 0;
}
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
struct bpf_prog *old_prog)
{
- struct bpf_prog **prog = progs->progs;
+ struct bpf_prog_array_item *item = array->items;
- for (; *prog; prog++)
- if (*prog == old_prog) {
- WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ for (; item->prog; item++)
+ if (item->prog == old_prog) {
+ WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
break;
}
}
@@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog **existing_prog;
+ struct bpf_prog_array_item *existing;
struct bpf_prog_array *array;
bool found_exclude = false;
int new_prog_idx = 0;
@@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
* the new array.
*/
if (old_array) {
- existing_prog = old_array->progs;
- for (; *existing_prog; existing_prog++) {
- if (*existing_prog == exclude_prog) {
+ existing = old_array->items;
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog) {
found_exclude = true;
continue;
}
- if (*existing_prog != &dummy_bpf_prog.prog)
+ if (existing->prog != &dummy_bpf_prog.prog)
carry_prog_cnt++;
- if (*existing_prog == include_prog)
+ if (existing->prog == include_prog)
return -EEXIST;
}
}
@@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
/* Fill in the new prog array */
if (carry_prog_cnt) {
- existing_prog = old_array->progs;
- for (; *existing_prog; existing_prog++)
- if (*existing_prog != exclude_prog &&
- *existing_prog != &dummy_bpf_prog.prog)
- array->progs[new_prog_idx++] = *existing_prog;
+ existing = old_array->items;
+ for (; existing->prog; existing++)
+ if (existing->prog != exclude_prog &&
+ existing->prog != &dummy_bpf_prog.prog) {
+ array->items[new_prog_idx++].prog =
+ existing->prog;
+ }
}
if (include_prog)
- array->progs[new_prog_idx++] = include_prog;
- array->progs[new_prog_idx] = NULL;
+ array->items[new_prog_idx++].prog = include_prog;
+ array->items[new_prog_idx].prog = NULL;
*new_array = array;
return 0;
}
@@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt)
{
- struct bpf_prog **prog;
u32 cnt = 0;
if (array)
@@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
return 0;
/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
- prog = rcu_dereference_check(array, 1)->progs;
- return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
: 0;
}
@@ -1793,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 46f5f29605d4..24aac0d0f412 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map)
* It does __not__ ensure pending flush operations (if any) are
* complete.
*/
+
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
@@ -555,6 +557,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_update_elem = cpu_map_update_elem,
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
+ .map_check_btf = map_check_no_btf,
};
static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 750d45edae79..141710b82a6c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
@@ -488,6 +489,7 @@ const struct bpf_map_ops dev_map_ops = {
.map_lookup_elem = dev_map_lookup_elem,
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
};
static int dev_map_notification(struct notifier_block *notifier,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 513d9dfcf4ee..04b8eda94e7d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -11,9 +11,11 @@
* General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
+#include <uapi/linux/btf.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
@@ -1162,6 +1164,27 @@ static void htab_map_free(struct bpf_map *map)
kfree(htab);
}
+static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+
+ rcu_read_lock();
+
+ value = htab_map_lookup_elem(map, key);
+ if (!value) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_puts(m, "\n");
+
+ rcu_read_unlock();
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1171,6 +1194,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
+ .map_seq_show_elem = htab_map_seq_show_elem,
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1182,6 +1206,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
+ .map_seq_show_elem = htab_map_seq_show_elem,
};
/* Called from eBPF program */
@@ -1408,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
+ .map_check_btf = map_check_no_btf,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 73065e2d23c2..1991466b8327 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
};
+
+DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* map and flags arguments are not used now,
+ * but provide an ability to extend the API
+ * for other types of local storages.
+ * verifier checks that their values are correct.
+ */
+ return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 76efe9a183f5..2ada5e21dfa6 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
struct bpf_map *map = seq_file_to_map(m);
void *key = map_iter(m)->key;
+ void *prev_key;
if (map_iter(m)->done)
return NULL;
if (unlikely(v == SEQ_START_TOKEN))
- goto done;
+ prev_key = NULL;
+ else
+ prev_key = key;
- if (map->ops->map_get_next_key(map, key, key)) {
+ if (map->ops->map_get_next_key(map, prev_key, key)) {
map_iter(m)->done = true;
return NULL;
}
-done:
++(*pos);
return key;
}
@@ -332,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
struct bpf_map *map = arg;
return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
- map->btf ? &bpffs_map_fops : &bpffs_obj_fops);
+ bpf_map_support_seq_show(map) ?
+ &bpffs_map_fops : &bpffs_obj_fops);
}
static struct dentry *
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
new file mode 100644
index 000000000000..22ad967d1e5f
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,379 @@
+//SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+
+#ifdef CONFIG_CGROUP_BPF
+
+#define LOCAL_STORAGE_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_cgroup_storage_map {
+ struct bpf_map map;
+
+ spinlock_t lock;
+ struct bpf_prog *prog;
+ struct rb_root root;
+ struct list_head list;
+};
+
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_cgroup_storage_map, map);
+}
+
+static int bpf_cgroup_storage_key_cmp(
+ const struct bpf_cgroup_storage_key *key1,
+ const struct bpf_cgroup_storage_key *key2)
+{
+ if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+ return -1;
+ else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+ return 1;
+ else if (key1->attach_type < key2->attach_type)
+ return -1;
+ else if (key1->attach_type > key2->attach_type)
+ return 1;
+ return 0;
+}
+
+static struct bpf_cgroup_storage *cgroup_storage_lookup(
+ struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
+ bool locked)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node *node;
+
+ if (!locked)
+ spin_lock_bh(&map->lock);
+
+ node = root->rb_node;
+ while (node) {
+ struct bpf_cgroup_storage *storage;
+
+ storage = container_of(node, struct bpf_cgroup_storage, node);
+
+ switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+ case -1:
+ node = node->rb_left;
+ break;
+ case 1:
+ node = node->rb_right;
+ break;
+ default:
+ if (!locked)
+ spin_unlock_bh(&map->lock);
+ return storage;
+ }
+ }
+
+ if (!locked)
+ spin_unlock_bh(&map->lock);
+
+ return NULL;
+}
+
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+ struct bpf_cgroup_storage *storage)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ while (*new) {
+ struct bpf_cgroup_storage *this;
+
+ this = container_of(*new, struct bpf_cgroup_storage, node);
+
+ parent = *new;
+ switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+ case -1:
+ new = &((*new)->rb_left);
+ break;
+ case 1:
+ new = &((*new)->rb_right);
+ break;
+ default:
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&storage->node, parent, new);
+ rb_insert_color(&storage->node, root);
+
+ return 0;
+}
+
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage)
+ return NULL;
+
+ return &READ_ONCE(storage->buf)->data[0];
+}
+
+static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+ void *value, u64 flags)
+{
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ struct bpf_storage_buffer *new;
+
+ if (flags & BPF_NOEXIST)
+ return -EINVAL;
+
+ storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
+ key, false);
+ if (!storage)
+ return -ENOENT;
+
+ new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+ map->value_size, __GFP_ZERO | GFP_USER,
+ map->numa_node);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(&new->data[0], value, map->value_size);
+
+ new = xchg(&storage->buf, new);
+ kfree_rcu(new, rcu);
+
+ return 0;
+}
+
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+ void *_next_key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage_key *next = _next_key;
+ struct bpf_cgroup_storage *storage;
+
+ spin_lock_bh(&map->lock);
+
+ if (list_empty(&map->list))
+ goto enoent;
+
+ if (key) {
+ storage = cgroup_storage_lookup(map, key, true);
+ if (!storage)
+ goto enoent;
+
+ storage = list_next_entry(storage, list);
+ if (!storage)
+ goto enoent;
+ } else {
+ storage = list_first_entry(&map->list,
+ struct bpf_cgroup_storage, list);
+ }
+
+ spin_unlock_bh(&map->lock);
+ next->attach_type = storage->key.attach_type;
+ next->cgroup_inode_id = storage->key.cgroup_inode_id;
+ return 0;
+
+enoent:
+ spin_unlock_bh(&map->lock);
+ return -ENOENT;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_cgroup_storage_map *map;
+
+ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
+ /* reserved bits should not be used */
+ return ERR_PTR(-EINVAL);
+
+ if (attr->max_entries)
+ /* max_entries is not used and enforced to be 0 */
+ return ERR_PTR(-EINVAL);
+
+ map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
+ __GFP_ZERO | GFP_USER, numa_node);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&map->map, attr);
+
+ spin_lock_init(&map->lock);
+ map->root = RB_ROOT;
+ INIT_LIST_HEAD(&map->list);
+
+ return &map->map;
+}
+
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+ WARN_ON(!RB_EMPTY_ROOT(&map->root));
+ WARN_ON(!list_empty(&map->list));
+
+ kfree(map);
+}
+
+static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops cgroup_storage_map_ops = {
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = cgroup_storage_get_next_key,
+ .map_lookup_elem = cgroup_storage_lookup_elem,
+ .map_update_elem = cgroup_storage_update_elem,
+ .map_delete_elem = cgroup_storage_delete_elem,
+ .map_check_btf = map_check_no_btf,
+};
+
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ int ret = -EBUSY;
+
+ spin_lock_bh(&map->lock);
+
+ if (map->prog && map->prog != prog)
+ goto unlock;
+ if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+ goto unlock;
+
+ map->prog = prog;
+ prog->aux->cgroup_storage = _map;
+ ret = 0;
+unlock:
+ spin_unlock_bh(&map->lock);
+
+ return ret;
+}
+
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+ spin_lock_bh(&map->lock);
+ if (map->prog == prog) {
+ WARN_ON(prog->aux->cgroup_storage != _map);
+ map->prog = NULL;
+ prog->aux->cgroup_storage = NULL;
+ }
+ spin_unlock_bh(&map->lock);
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+{
+ struct bpf_cgroup_storage *storage;
+ struct bpf_map *map;
+ u32 pages;
+
+ map = prog->aux->cgroup_storage;
+ if (!map)
+ return NULL;
+
+ pages = round_up(sizeof(struct bpf_cgroup_storage) +
+ sizeof(struct bpf_storage_buffer) +
+ map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ if (bpf_map_charge_memlock(map, pages))
+ return ERR_PTR(-EPERM);
+
+ storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
+ __GFP_ZERO | GFP_USER, map->numa_node);
+ if (!storage) {
+ bpf_map_uncharge_memlock(map, pages);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+ map->value_size, __GFP_ZERO | GFP_USER,
+ map->numa_node);
+ if (!storage->buf) {
+ bpf_map_uncharge_memlock(map, pages);
+ kfree(storage);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ storage->map = (struct bpf_cgroup_storage_map *)map;
+
+ return storage;
+}
+
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+ u32 pages;
+ struct bpf_map *map;
+
+ if (!storage)
+ return;
+
+ map = &storage->map->map;
+ pages = round_up(sizeof(struct bpf_cgroup_storage) +
+ sizeof(struct bpf_storage_buffer) +
+ map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ bpf_map_uncharge_memlock(map, pages);
+
+ kfree_rcu(storage->buf, rcu);
+ kfree_rcu(storage, rcu);
+}
+
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+ struct cgroup *cgroup,
+ enum bpf_attach_type type)
+{
+ struct bpf_cgroup_storage_map *map;
+
+ if (!storage)
+ return;
+
+ storage->key.attach_type = type;
+ storage->key.cgroup_inode_id = cgroup->kn->id.id;
+
+ map = storage->map;
+
+ spin_lock_bh(&map->lock);
+ WARN_ON(cgroup_storage_insert(map, storage));
+ list_add(&storage->list, &map->list);
+ spin_unlock_bh(&map->lock);
+}
+
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+ struct bpf_cgroup_storage_map *map;
+ struct rb_root *root;
+
+ if (!storage)
+ return;
+
+ map = storage->map;
+
+ spin_lock_bh(&map->lock);
+ root = &map->root;
+ rb_erase(&storage->node, root);
+
+ list_del(&storage->list);
+ spin_unlock_bh(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1603492c9cc7..9058317ba9de 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -10,11 +10,13 @@
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <net/ipv6.h>
+#include <uapi/linux/btf.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -686,6 +688,15 @@ free_stack:
return err;
}
+static int trie_check_btf(const struct bpf_map *map,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Keys must have struct bpf_lpm_trie_key embedded. */
+ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
+ -EINVAL : 0;
+}
+
const struct bpf_map_ops trie_map_ops = {
.map_alloc = trie_alloc,
.map_free = trie_free,
@@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = {
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
+ .map_check_btf = trie_check_btf,
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 1da574612bea..3bfbf4464416 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -23,7 +23,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
* is a runtime binding. Doing static check alone
* in the verifier is not enough.
*/
- if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ac747d5cf7c6..177a52436394 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -18,19 +18,43 @@
#include <linux/bug.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/proc_ns.h>
+#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
-/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+/* Protects offdevs, members of bpf_offload_netdev and offload members
* of all progs.
* RTNL lock cannot be taken when holding this lock.
*/
static DECLARE_RWSEM(bpf_devs_lock);
-static LIST_HEAD(bpf_prog_offload_devs);
-static LIST_HEAD(bpf_map_offload_devs);
+
+struct bpf_offload_dev {
+ struct list_head netdevs;
+};
+
+struct bpf_offload_netdev {
+ struct rhash_head l;
+ struct net_device *netdev;
+ struct bpf_offload_dev *offdev;
+ struct list_head progs;
+ struct list_head maps;
+ struct list_head offdev_netdevs;
+};
+
+static const struct rhashtable_params offdevs_params = {
+ .nelem_hint = 4,
+ .key_len = sizeof(struct net_device *),
+ .key_offset = offsetof(struct bpf_offload_netdev, netdev),
+ .head_offset = offsetof(struct bpf_offload_netdev, l),
+ .automatic_shrinking = true,
+};
+
+static struct rhashtable offdevs;
+static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -41,8 +65,19 @@ static int bpf_dev_offload_check(struct net_device *netdev)
return 0;
}
+static struct bpf_offload_netdev *
+bpf_offload_find_netdev(struct net_device *netdev)
+{
+ lockdep_assert_held(&bpf_devs_lock);
+
+ if (!offdevs_inited)
+ return NULL;
+ return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+}
+
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
{
+ struct bpf_offload_netdev *ondev;
struct bpf_prog_offload *offload;
int err;
@@ -66,12 +101,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
goto err_maybe_put;
down_write(&bpf_devs_lock);
- if (offload->netdev->reg_state != NETREG_REGISTERED) {
+ ondev = bpf_offload_find_netdev(offload->netdev);
+ if (!ondev) {
err = -EINVAL;
goto err_unlock;
}
prog->aux->offload = offload;
- list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+ list_add_tail(&offload->offloads, &ondev->progs);
dev_put(offload->netdev);
up_write(&bpf_devs_lock);
@@ -294,6 +330,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
+ struct bpf_offload_netdev *ondev;
struct bpf_offloaded_map *offmap;
int err;
@@ -316,11 +353,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
if (err)
goto err_unlock;
+ ondev = bpf_offload_find_netdev(offmap->netdev);
+ if (!ondev) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
if (err)
goto err_unlock;
- list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+ list_add_tail(&offmap->offloads, &ondev->maps);
up_write(&bpf_devs_lock);
rtnl_unlock();
@@ -468,77 +511,159 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
return 0;
}
-bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+static bool __bpf_offload_dev_match(struct bpf_prog *prog,
+ struct net_device *netdev)
{
- struct bpf_offloaded_map *offmap;
+ struct bpf_offload_netdev *ondev1, *ondev2;
struct bpf_prog_offload *offload;
- bool ret;
if (!bpf_prog_is_dev_bound(prog->aux))
return false;
- if (!bpf_map_is_dev_bound(map))
- return bpf_map_offload_neutral(map);
- down_read(&bpf_devs_lock);
offload = prog->aux->offload;
- offmap = map_to_offmap(map);
+ if (!offload)
+ return false;
+ if (offload->netdev == netdev)
+ return true;
- ret = offload && offload->netdev == offmap->netdev;
+ ondev1 = bpf_offload_find_netdev(offload->netdev);
+ ondev2 = bpf_offload_find_netdev(netdev);
+
+ return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
+{
+ bool ret;
+
+ down_read(&bpf_devs_lock);
+ ret = __bpf_offload_dev_match(prog, netdev);
up_read(&bpf_devs_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
-static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
- struct bpf_prog_offload *offload, *tmp;
+ struct bpf_offloaded_map *offmap;
+ bool ret;
- list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
- if (offload->netdev == netdev)
- __bpf_prog_offload_destroy(offload->prog);
+ if (!bpf_map_is_dev_bound(map))
+ return bpf_map_offload_neutral(map);
+ offmap = map_to_offmap(map);
+
+ down_read(&bpf_devs_lock);
+ ret = __bpf_offload_dev_match(prog, offmap->netdev);
+ up_read(&bpf_devs_lock);
+
+ return ret;
}
-static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
- struct bpf_offloaded_map *offmap, *tmp;
+ struct bpf_offload_netdev *ondev;
+ int err;
- list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
- if (offmap->netdev == netdev)
- __bpf_map_offload_destroy(offmap);
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
+
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ down_write(&bpf_devs_lock);
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_unlock_free;
+ }
+
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ up_write(&bpf_devs_lock);
+ return 0;
+
+err_unlock_free:
+ up_write(&bpf_devs_lock);
+ kfree(ondev);
+ return err;
}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
-static int bpf_offload_notification(struct notifier_block *notifier,
- ulong event, void *ptr)
+void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
- struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_offload_netdev *ondev, *altdev;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
ASSERT_RTNL();
- switch (event) {
- case NETDEV_UNREGISTER:
- /* ignore namespace changes */
- if (netdev->reg_state != NETREG_UNREGISTERING)
- break;
-
- down_write(&bpf_devs_lock);
- bpf_offload_orphan_all_progs(netdev);
- bpf_offload_orphan_all_maps(netdev);
- up_write(&bpf_devs_lock);
- break;
- default:
- break;
+ down_write(&bpf_devs_lock);
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ goto unlock;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+ list_del(&ondev->offdev_netdevs);
+
+ /* Try to move the objects to another netdev of the device */
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
}
- return NOTIFY_OK;
-}
-static struct notifier_block bpf_offload_notifier = {
- .notifier_call = bpf_offload_notification,
-};
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+unlock:
+ up_write(&bpf_devs_lock);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
-static int __init bpf_offload_init(void)
+struct bpf_offload_dev *bpf_offload_dev_create(void)
{
- register_netdevice_notifier(&bpf_offload_notifier);
- return 0;
+ struct bpf_offload_dev *offdev;
+ int err;
+
+ down_write(&bpf_devs_lock);
+ if (!offdevs_inited) {
+ err = rhashtable_init(&offdevs, &offdevs_params);
+ if (err)
+ return ERR_PTR(err);
+ offdevs_inited = true;
+ }
+ up_write(&bpf_devs_lock);
+
+ offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
+ if (!offdev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&offdev->netdevs);
+
+ return offdev;
}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_create);
-subsys_initcall(bpf_offload_init);
+void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
+{
+ WARN_ON(!list_empty(&offdev->netdevs));
+ kfree(offdev);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 000000000000..18e225de80ff
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+
+struct reuseport_array {
+ struct bpf_map map;
+ struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+ return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+ struct sock __rcu **socks;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ socks = sk->sk_user_data;
+ if (socks) {
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ /*
+ * Do not move this NULL assignment outside of
+ * sk->sk_callback_lock because there is
+ * a race with reuseport_array_free()
+ * which does not hold the reuseport_lock.
+ */
+ RCU_INIT_POINTER(*socks, NULL);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64))
+ return -EINVAL;
+
+ return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+ struct sock *sk;
+ int err;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (!rcu_access_pointer(array->ptrs[index]))
+ return -ENOENT;
+
+ spin_lock_bh(&reuseport_lock);
+
+ sk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ RCU_INIT_POINTER(array->ptrs[index], NULL);
+ write_unlock_bh(&sk->sk_callback_lock);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *sk;
+ u32 i;
+
+ synchronize_rcu();
+
+ /*
+ * ops->map_*_elem() will not be able to access this
+ * array now. Hence, this function only races with
+ * bpf_sk_reuseport_detach() which was triggerred by
+ * close() or disconnect().
+ *
+ * This function and bpf_sk_reuseport_detach() are
+ * both removing sk from "array". Who removes it
+ * first does not matter.
+ *
+ * The only concern here is bpf_sk_reuseport_detach()
+ * may access "array" which is being freed here.
+ * bpf_sk_reuseport_detach() access this "array"
+ * through sk->sk_user_data _and_ with sk->sk_callback_lock
+ * held which is enough because this "array" is not freed
+ * until all sk->sk_user_data has stopped referencing this "array".
+ *
+ * Hence, due to the above, taking "reuseport_lock" is not
+ * needed here.
+ */
+
+ /*
+ * Since reuseport_lock is not taken, sk is accessed under
+ * rcu_read_lock()
+ */
+ rcu_read_lock();
+ for (i = 0; i < map->max_entries; i++) {
+ sk = rcu_dereference(array->ptrs[i]);
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ /*
+ * No need for WRITE_ONCE(). At this point,
+ * no one is reading it without taking the
+ * sk->sk_callback_lock.
+ */
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+ RCU_INIT_POINTER(array->ptrs[i], NULL);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Once reaching here, all sk->sk_user_data is not
+ * referenceing this "array". "array" can be freed now.
+ */
+ bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+ int err, numa_node = bpf_map_attr_numa_node(attr);
+ struct reuseport_array *array;
+ u64 cost, array_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ array_size = sizeof(*array);
+ array_size += (u64)attr->max_entries * sizeof(struct sock *);
+
+ /* make sure there is no u32 overflow later in round_up() */
+ cost = array_size;
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-ENOMEM);
+ cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = bpf_map_precharge_memlock(cost);
+ if (err)
+ return ERR_PTR(err);
+
+ /* allocate all map elements and zero-initialize them */
+ array = bpf_map_area_alloc(array_size, numa_node);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&array->map, attr);
+ array->map.pages = cost;
+
+ return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct sock *sk;
+ int err;
+
+ if (map->value_size != sizeof(u64))
+ return -ENOSPC;
+
+ rcu_read_lock();
+ sk = reuseport_array_lookup_elem(map, key);
+ if (sk) {
+ *(u64 *)value = sock_gen_cookie(sk);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+ const struct sock *nsk,
+ const struct sock *osk,
+ const struct sock_reuseport *nsk_reuse,
+ u32 map_flags)
+{
+ if (osk && map_flags == BPF_NOEXIST)
+ return -EEXIST;
+
+ if (!osk && map_flags == BPF_EXIST)
+ return -ENOENT;
+
+ if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+ return -ENOTSUPP;
+
+ if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+ return -ENOTSUPP;
+
+ if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+ return -ENOTSUPP;
+
+ /*
+ * sk must be hashed (i.e. listening in the TCP case or binded
+ * in the UDP case) and
+ * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+ *
+ * Also, sk will be used in bpf helper that is protected by
+ * rcu_read_lock().
+ */
+ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+ return -EINVAL;
+
+ /* READ_ONCE because the sk->sk_callback_lock may not be held here */
+ if (READ_ONCE(nsk->sk_user_data))
+ return -EBUSY;
+
+ return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" in the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *free_osk = NULL, *osk, *nsk;
+ struct sock_reuseport *reuse;
+ u32 index = *(u32 *)key;
+ struct socket *socket;
+ int err, fd;
+
+ if (map_flags > BPF_EXIST)
+ return -EINVAL;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (map->value_size == sizeof(u64)) {
+ u64 fd64 = *(u64 *)value;
+
+ if (fd64 > S32_MAX)
+ return -EINVAL;
+ fd = fd64;
+ } else {
+ fd = *(int *)value;
+ }
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ nsk = socket->sk;
+ if (!nsk) {
+ err = -EINVAL;
+ goto put_file;
+ }
+
+ /* Quick checks before taking reuseport_lock */
+ err = reuseport_array_update_check(array, nsk,
+ rcu_access_pointer(array->ptrs[index]),
+ rcu_access_pointer(nsk->sk_reuseport_cb),
+ map_flags);
+ if (err)
+ goto put_file;
+
+ spin_lock_bh(&reuseport_lock);
+ /*
+ * Some of the checks only need reuseport_lock
+ * but it is done under sk_callback_lock also
+ * for simplicity reason.
+ */
+ write_lock_bh(&nsk->sk_callback_lock);
+
+ osk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+ if (err)
+ goto put_file_unlock;
+
+ /* Ensure reuse->reuseport_id is set */
+ err = reuseport_get_id(reuse);
+ if (err < 0)
+ goto put_file_unlock;
+
+ WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
+ rcu_assign_pointer(array->ptrs[index], nsk);
+ free_osk = osk;
+ err = 0;
+
+put_file_unlock:
+ write_unlock_bh(&nsk->sk_callback_lock);
+
+ if (free_osk) {
+ write_lock_bh(&free_osk->sk_callback_lock);
+ WRITE_ONCE(free_osk->sk_user_data, NULL);
+ write_unlock_bh(&free_osk->sk_callback_lock);
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+put_file:
+ fput(socket->file);
+ return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops reuseport_array_ops = {
+ .map_alloc_check = reuseport_array_alloc_check,
+ .map_alloc = reuseport_array_alloc,
+ .map_free = reuseport_array_free,
+ .map_lookup_elem = reuseport_array_lookup_elem,
+ .map_get_next_key = reuseport_array_get_next_key,
+ .map_delete_elem = reuseport_array_delete_elem,
+};
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c4d75c52b4fc..98e621a29e8e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -58,6 +58,7 @@ struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
struct bpf_sock_progs progs;
+ raw_spinlock_t lock;
};
struct bucket {
@@ -89,9 +90,9 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
+ struct bpf_map *map;
struct sock **entry;
struct htab_elem __rcu *hash_link;
- struct bpf_htab __rcu *htab;
};
struct smap_psock {
@@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
e = psock_map_pop(sk, psock);
while (e) {
if (e->entry) {
- osk = cmpxchg(e->entry, sk, NULL);
+ struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
+
+ raw_spin_lock_bh(&stab->lock);
+ osk = *e->entry;
if (osk == sk) {
+ *e->entry = NULL;
smap_release_sock(psock, sk);
}
+ raw_spin_unlock_bh(&stab->lock);
} else {
struct htab_elem *link = rcu_dereference(e->hash_link);
- struct bpf_htab *htab = rcu_dereference(e->htab);
+ struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
struct hlist_head *head;
struct htab_elem *l;
struct bucket *b;
@@ -370,6 +376,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
}
raw_spin_unlock_bh(&b->lock);
}
+ kfree(e);
e = psock_map_pop(sk, psock);
}
rcu_read_unlock();
@@ -725,11 +732,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
{
bool ingress = !!(md->flags & BPF_F_INGRESS);
struct smap_psock *psock;
- struct scatterlist *sg;
int err = 0;
- sg = md->sg_data;
-
rcu_read_lock();
psock = smap_psock_sk(sk);
if (unlikely(!psock))
@@ -1644,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
+ raw_spin_lock_init(&stab->lock);
/* make sure page count doesn't overflow */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -1678,8 +1683,10 @@ static void smap_list_map_remove(struct smap_psock *psock,
spin_lock_bh(&psock->maps_lock);
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry)
+ if (e->entry == entry) {
list_del(&e->list);
+ kfree(e);
+ }
}
spin_unlock_bh(&psock->maps_lock);
}
@@ -1693,8 +1700,10 @@ static void smap_list_hash_remove(struct smap_psock *psock,
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
struct htab_elem *c = rcu_dereference(e->hash_link);
- if (c == hash_link)
+ if (c == hash_link) {
list_del(&e->list);
+ kfree(e);
+ }
}
spin_unlock_bh(&psock->maps_lock);
}
@@ -1714,14 +1723,15 @@ static void sock_map_free(struct bpf_map *map)
* and a grace period expire to ensure psock is really safe to remove.
*/
rcu_read_lock();
+ raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct smap_psock *psock;
struct sock *sock;
- sock = xchg(&stab->sock_map[i], NULL);
+ sock = stab->sock_map[i];
if (!sock)
continue;
-
+ stab->sock_map[i] = NULL;
psock = smap_psock_sk(sock);
/* This check handles a racing sock event that can get the
* sk_callback_lock before this case but after xchg happens
@@ -1733,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map)
smap_release_sock(psock, sock);
}
}
+ raw_spin_unlock_bh(&stab->lock);
rcu_read_unlock();
sock_map_remove_complete(stab);
@@ -1776,19 +1787,23 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- sock = xchg(&stab->sock_map[k], NULL);
+ raw_spin_lock_bh(&stab->lock);
+ sock = stab->sock_map[k];
+ stab->sock_map[k] = NULL;
+ raw_spin_unlock_bh(&stab->lock);
if (!sock)
return -EINVAL;
psock = smap_psock_sk(sock);
if (!psock)
- goto out;
-
- if (psock->bpf_parse)
+ return 0;
+ if (psock->bpf_parse) {
+ write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
+ write_unlock_bh(&sock->sk_callback_lock);
+ }
smap_list_map_remove(psock, &stab->sock_map[k]);
smap_release_sock(psock, sock);
-out:
return 0;
}
@@ -1824,11 +1839,9 @@ out:
static int __sock_map_ctx_update_elem(struct bpf_map *map,
struct bpf_sock_progs *progs,
struct sock *sock,
- struct sock **map_link,
void *key)
{
struct bpf_prog *verdict, *parse, *tx_msg;
- struct smap_psock_map_entry *e = NULL;
struct smap_psock *psock;
bool new = false;
int err = 0;
@@ -1901,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
new = true;
}
- if (map_link) {
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e) {
- err = -ENOMEM;
- goto out_free;
- }
- }
-
/* 3. At this point we have a reference to a valid psock that is
* running. Attach any BPF programs needed.
*/
@@ -1930,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
write_unlock_bh(&sock->sk_callback_lock);
}
- /* 4. Place psock in sockmap for use and stop any programs on
- * the old sock assuming its not the same sock we are replacing
- * it with. Because we can only have a single set of programs if
- * old_sock has a strp we can stop it.
- */
- if (map_link) {
- e->entry = map_link;
- spin_lock_bh(&psock->maps_lock);
- list_add_tail(&e->list, &psock->maps);
- spin_unlock_bh(&psock->maps_lock);
- }
return err;
out_free:
smap_release_sock(psock, sock);
@@ -1951,7 +1945,6 @@ out_progs:
}
if (tx_msg)
bpf_prog_put(tx_msg);
- kfree(e);
return err;
}
@@ -1961,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_sock_progs *progs = &stab->progs;
- struct sock *osock, *sock;
+ struct sock *osock, *sock = skops->sk;
+ struct smap_psock_map_entry *e;
+ struct smap_psock *psock;
u32 i = *(u32 *)key;
int err;
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
-
if (unlikely(i >= stab->map.max_entries))
return -E2BIG;
- sock = READ_ONCE(stab->sock_map[i]);
- if (flags == BPF_EXIST && !sock)
- return -ENOENT;
- else if (flags == BPF_NOEXIST && sock)
- return -EEXIST;
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e)
+ return -ENOMEM;
- sock = skops->sk;
- err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
- key);
+ err = __sock_map_ctx_update_elem(map, progs, sock, key);
if (err)
goto out;
- osock = xchg(&stab->sock_map[i], sock);
- if (osock) {
- struct smap_psock *opsock = smap_psock_sk(osock);
+ /* psock guaranteed to be present. */
+ psock = smap_psock_sk(sock);
+ raw_spin_lock_bh(&stab->lock);
+ osock = stab->sock_map[i];
+ if (osock && flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ if (!osock && flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
- smap_list_map_remove(opsock, &stab->sock_map[i]);
- smap_release_sock(opsock, osock);
+ e->entry = &stab->sock_map[i];
+ e->map = map;
+ spin_lock_bh(&psock->maps_lock);
+ list_add_tail(&e->list, &psock->maps);
+ spin_unlock_bh(&psock->maps_lock);
+
+ stab->sock_map[i] = sock;
+ if (osock) {
+ psock = smap_psock_sk(osock);
+ smap_list_map_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, osock);
}
+ raw_spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ smap_release_sock(psock, sock);
+ raw_spin_unlock_bh(&stab->lock);
out:
+ kfree(e);
return err;
}
@@ -2353,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
b = __select_bucket(htab, hash);
head = &b->head;
- err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+ err = __sock_map_ctx_update_elem(map, progs, sock, key);
if (err)
goto err;
@@ -2379,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
}
rcu_assign_pointer(e->hash_link, l_new);
- rcu_assign_pointer(e->htab,
- container_of(map, struct bpf_htab, map));
+ e->map = map;
spin_lock_bh(&psock->maps_lock);
list_add_tail(&e->list, &psock->maps);
spin_unlock_bh(&psock->maps_lock);
@@ -2501,6 +2514,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
.map_release_uref = sock_map_release,
+ .map_check_btf = map_check_no_btf,
};
const struct bpf_map_ops sock_hash_ops = {
@@ -2511,6 +2525,7 @@ const struct bpf_map_ops sock_hash_ops = {
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
.map_release_uref = sock_map_release,
+ .map_check_btf = map_check_no_btf,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b675a3f3d141..8061a439ef18 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = {
.map_lookup_elem = stack_map_lookup_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
};
static int __init stack_map_init(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b41c6cf2eb88..8339d81cba1d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
const struct bpf_map_ops bpf_map_offload_ops = {
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
+ .map_check_btf = map_check_no_btf,
};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -181,32 +182,60 @@ int bpf_map_precharge_memlock(u32 pages)
return 0;
}
-static int bpf_map_charge_memlock(struct bpf_map *map)
+static int bpf_charge_memlock(struct user_struct *user, u32 pages)
{
- struct user_struct *user = get_current_user();
- unsigned long memlock_limit;
+ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
+ atomic_long_sub(pages, &user->locked_vm);
+ return -EPERM;
+ }
+ return 0;
+}
- atomic_long_add(map->pages, &user->locked_vm);
+static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
+{
+ atomic_long_sub(pages, &user->locked_vm);
+}
- if (atomic_long_read(&user->locked_vm) > memlock_limit) {
- atomic_long_sub(map->pages, &user->locked_vm);
+static int bpf_map_init_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ int ret;
+
+ ret = bpf_charge_memlock(user, map->pages);
+ if (ret) {
free_uid(user);
- return -EPERM;
+ return ret;
}
map->user = user;
- return 0;
+ return ret;
}
-static void bpf_map_uncharge_memlock(struct bpf_map *map)
+static void bpf_map_release_memlock(struct bpf_map *map)
{
struct user_struct *user = map->user;
-
- atomic_long_sub(map->pages, &user->locked_vm);
+ bpf_uncharge_memlock(user, map->pages);
free_uid(user);
}
+int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
+{
+ int ret;
+
+ ret = bpf_charge_memlock(map->user, pages);
+ if (ret)
+ return ret;
+ map->pages += pages;
+ return ret;
+}
+
+void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
+{
+ bpf_uncharge_memlock(map->user, pages);
+ map->pages -= pages;
+}
+
static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
@@ -256,7 +285,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- bpf_map_uncharge_memlock(map);
+ bpf_map_release_memlock(map);
security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
@@ -427,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
+int map_check_no_btf(const struct bpf_map *map,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ return -ENOTSUPP;
+}
+
+static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ u32 btf_key_id, u32 btf_value_id)
+{
+ const struct btf_type *key_type, *value_type;
+ u32 key_size, value_size;
+ int ret = 0;
+
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || key_size != map->key_size)
+ return -EINVAL;
+
+ value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+ if (!value_type || value_size != map->value_size)
+ return -EINVAL;
+
+ if (map->ops->map_check_btf)
+ ret = map->ops->map_check_btf(map, key_type, value_type);
+
+ return ret;
+}
+
#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
@@ -461,8 +518,7 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- if (bpf_map_support_seq_show(map) &&
- (attr->btf_key_type_id || attr->btf_value_type_id)) {
+ if (attr->btf_key_type_id || attr->btf_value_type_id) {
struct btf *btf;
if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
@@ -476,8 +532,8 @@ static int map_create(union bpf_attr *attr)
goto free_map_nouncharge;
}
- err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
if (err) {
btf_put(btf);
goto free_map_nouncharge;
@@ -492,7 +548,7 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_charge_memlock(map);
+ err = bpf_map_init_memlock(map);
if (err)
goto free_map_sec;
@@ -515,7 +571,7 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
- bpf_map_uncharge_memlock(map);
+ bpf_map_release_memlock(map);
free_map_sec:
security_bpf_map_free(map);
free_map_nouncharge:
@@ -656,6 +712,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -762,6 +820,10 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ attr->flags);
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -929,6 +991,9 @@ static void free_used_maps(struct bpf_prog_aux *aux)
{
int i;
+ if (aux->cgroup_storage)
+ bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+
for (i = 0; i < aux->used_map_cnt; i++)
bpf_map_put(aux->used_maps[i]);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 63aaac52a265..92246117d2b0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
@@ -2127,6 +2128,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ if (func_id != BPF_FUNC_get_local_storage)
+ goto error;
+ break;
/* devmap returns a pointer to a live net_device ifindex that we cannot
* allow to be modified from bpf side. So do not allow lookup elements
* for now.
@@ -2162,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_msg_redirect_hash)
goto error;
break;
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ if (func_id != BPF_FUNC_sk_select_reuseport)
+ goto error;
+ break;
default:
break;
}
@@ -2209,6 +2218,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
+ case BPF_FUNC_get_local_storage:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_sk_select_reuseport:
+ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
+ goto error;
+ break;
default:
break;
}
@@ -2533,6 +2550,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
regs = cur_regs(env);
+
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (func_id == BPF_FUNC_get_local_storage &&
+ !register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -2545,8 +2572,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
+ fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ else
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
@@ -3238,8 +3269,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
}
- /* check dest operand */
- err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ /* check dest operand, mark as required later */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
@@ -3265,6 +3296,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: R = imm
* remember the value we stored into this reg
*/
+ /* clear any state __mark_reg_known doesn't set */
+ mark_reg_unknown(env, regs, insn->dst_reg);
regs[insn->dst_reg].type = SCALAR_VALUE;
if (BPF_CLASS(insn->code) == BPF_ALU64) {
__mark_reg_known(regs + insn->dst_reg,
@@ -5054,7 +5087,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
- !bpf_offload_dev_match(prog, map)) {
+ !bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
}
@@ -5152,6 +5185,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
}
env->used_maps[env->used_map_cnt++] = map;
+ if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+ bpf_cgroup_storage_assign(env->prog, map)) {
+ verbose(env,
+ "only one cgroup storage is allowed\n");
+ fdput(f);
+ return -EBUSY;
+ }
+
fdput(f);
next_insn:
insn++;
@@ -5178,6 +5219,10 @@ static void release_maps(struct bpf_verifier_env *env)
{
int i;
+ if (env->prog->aux->cgroup_storage)
+ bpf_cgroup_storage_release(env->prog,
+ env->prog->aux->cgroup_storage);
+
for (i = 0; i < env->used_map_cnt; i++)
bpf_map_put(env->used_maps[i]);
}
@@ -5799,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (insn->imm == BPF_FUNC_redirect_map) {
- /* Note, we cannot use prog directly as imm as subsequent
- * rewrites would still change the prog pointer. The only
- * stable address we can use is aux, which also works with
- * prog clones during blinding.
- */
- u64 addr = (unsigned long)prog->aux;
- struct bpf_insn r4_ld[] = {
- BPF_LD_IMM64(BPF_REG_4, addr),
- *insn,
- };
- cnt = ARRAY_SIZE(r4_ld);
-
- new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- }
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index b3c557476a8d..9f8463afda9c 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map)
struct xsk_map *m = container_of(map, struct xsk_map, map);
int i;
+ bpf_clear_redirect_map(map);
synchronize_net();
for (i = 0; i < map->max_entries; i++) {
@@ -227,6 +228,5 @@ const struct bpf_map_ops xsk_map_ops = {
.map_lookup_elem = xsk_map_lookup_elem,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
};
-
-
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 77ff1cd6a252..75568fcf2180 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -8,6 +8,32 @@
#include <linux/list.h>
#include <linux/refcount.h>
+#define TRACE_CGROUP_PATH_LEN 1024
+extern spinlock_t trace_cgroup_path_lock;
+extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+
+/*
+ * cgroup_path() takes a spin lock. It is good practice not to take
+ * spin locks within trace point handlers, as they are mostly hidden
+ * from normal view. As cgroup_path() can take the kernfs_rename_lock
+ * spin lock, it is best to not call that function from the trace event
+ * handler.
+ *
+ * Note: trace_cgroup_##type##_enabled() is a static branch that will only
+ * be set when the trace event is enabled.
+ */
+#define TRACE_CGROUP_PATH(type, cgrp, ...) \
+ do { \
+ if (trace_cgroup_##type##_enabled()) { \
+ spin_lock(&trace_cgroup_path_lock); \
+ cgroup_path(cgrp, trace_cgroup_path, \
+ TRACE_CGROUP_PATH_LEN); \
+ trace_cgroup_##type(cgrp, trace_cgroup_path, \
+ ##__VA_ARGS__); \
+ spin_unlock(&trace_cgroup_path_lock); \
+ } \
+ } while (0)
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8b4f0768efd6..51063e7a93c2 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -135,7 +135,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (task) {
ret = cgroup_migrate(task, false, &mgctx);
if (!ret)
- trace_cgroup_transfer_tasks(to, task, false);
+ TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
put_task_struct(task);
}
} while (task && !ret);
@@ -865,7 +865,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
- trace_cgroup_rename(cgrp);
+ TRACE_CGROUP_PATH(rename, cgrp);
mutex_unlock(&cgroup_mutex);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 077370bf8964..aae10baf1902 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -83,6 +83,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
+DEFINE_SPINLOCK(trace_cgroup_path_lock);
+char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
* grabbing cgroup_mutex.
@@ -2638,7 +2641,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
cgroup_migrate_finish(&mgctx);
if (!ret)
- trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
+ TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
return ret;
}
@@ -3557,7 +3560,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
- cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ cgroup_file_mode(cft),
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ 0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
@@ -4634,7 +4639,7 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup *tcgrp;
/* cgroup release path */
- trace_cgroup_release(cgrp);
+ TRACE_CGROUP_PATH(release, cgrp);
if (cgroup_on_dfl(cgrp))
cgroup_rstat_flush(cgrp);
@@ -4977,7 +4982,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
if (ret)
goto out_destroy;
- trace_cgroup_mkdir(cgrp);
+ TRACE_CGROUP_PATH(mkdir, cgrp);
/* let's create and online css's */
kernfs_activate(kn);
@@ -5165,9 +5170,8 @@ int cgroup_rmdir(struct kernfs_node *kn)
return 0;
ret = cgroup_destroy_locked(cgrp);
-
if (!ret)
- trace_cgroup_rmdir(cgrp);
+ TRACE_CGROUP_PATH(rmdir, cgrp);
cgroup_kn_unlock(kn);
return ret;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index dd8634dde1ae..ed44d7d34c2d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -60,6 +60,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ bool booted_once;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -290,6 +291,12 @@ void cpus_read_lock(void)
}
EXPORT_SYMBOL_GPL(cpus_read_lock);
+int cpus_read_trylock(void)
+{
+ return percpu_down_read_trylock(&cpu_hotplug_lock);
+}
+EXPORT_SYMBOL_GPL(cpus_read_trylock);
+
void cpus_read_unlock(void)
{
percpu_up_read(&cpu_hotplug_lock);
@@ -342,6 +349,85 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_HOTPLUG_SMT
+enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+EXPORT_SYMBOL_GPL(cpu_smt_control);
+
+static bool cpu_smt_available __read_mostly;
+
+void __init cpu_smt_disable(bool force)
+{
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return;
+
+ if (force) {
+ pr_info("SMT: Force disabled\n");
+ cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+ } else {
+ cpu_smt_control = CPU_SMT_DISABLED;
+ }
+}
+
+/*
+ * The decision whether SMT is supported can only be done after the full
+ * CPU identification. Called from architecture code before non boot CPUs
+ * are brought up.
+ */
+void __init cpu_smt_check_topology_early(void)
+{
+ if (!topology_smt_supported())
+ cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
+/*
+ * If SMT was disabled by BIOS, detect it here, after the CPUs have been
+ * brought online. This ensures the smt/l1tf sysfs entries are consistent
+ * with reality. cpu_smt_available is set to true during the bringup of non
+ * boot CPUs when a SMT sibling is detected. Note, this may overwrite
+ * cpu_smt_control's previous setting.
+ */
+void __init cpu_smt_check_topology(void)
+{
+ if (!cpu_smt_available)
+ cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
+static int __init smt_cmdline_disable(char *str)
+{
+ cpu_smt_disable(str && !strcmp(str, "force"));
+ return 0;
+}
+early_param("nosmt", smt_cmdline_disable);
+
+static inline bool cpu_smt_allowed(unsigned int cpu)
+{
+ if (topology_is_primary_thread(cpu))
+ return true;
+
+ /*
+ * If the CPU is not a 'primary' thread and the booted_once bit is
+ * set then the processor has SMT support. Store this information
+ * for the late check of SMT support in cpu_smt_check_topology().
+ */
+ if (per_cpu(cpuhp_state, cpu).booted_once)
+ cpu_smt_available = true;
+
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ return true;
+
+ /*
+ * On x86 it's required to boot all logical CPUs at least once so
+ * that the init code can get a chance to set CR4.MCE on each
+ * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
+ * core will shutdown the machine.
+ */
+ return !per_cpu(cpuhp_state, cpu).booted_once;
+}
+#else
+static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+#endif
+
static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
@@ -422,6 +508,16 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
+ /*
+ * SMT soft disabling on X86 requires to bring the CPU out of the
+ * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
+ * CPU marked itself as booted_once in cpu_notify_starting() so the
+ * cpu_smt_allowed() check will now return false if this is not the
+ * primary sibling.
+ */
+ if (!cpu_smt_allowed(cpu))
+ return -ECANCELED;
+
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
@@ -754,7 +850,6 @@ static int takedown_cpu(unsigned int cpu)
/* Park the smpboot threads */
kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
- smpboot_park_threads(cpu);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
@@ -907,20 +1002,19 @@ out:
return ret;
}
+static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
+{
+ if (cpu_hotplug_disabled)
+ return -EBUSY;
+ return _cpu_down(cpu, 0, target);
+}
+
static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
cpu_maps_update_begin();
-
- if (cpu_hotplug_disabled) {
- err = -EBUSY;
- goto out;
- }
-
- err = _cpu_down(cpu, 0, target);
-
-out:
+ err = cpu_down_maps_locked(cpu, target);
cpu_maps_update_done();
return err;
}
@@ -949,6 +1043,7 @@ void notify_cpu_starting(unsigned int cpu)
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ st->booted_once = true;
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@@ -1058,6 +1153,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
+ if (!cpu_smt_allowed(cpu)) {
+ err = -EPERM;
+ goto out;
+ }
err = _cpu_up(cpu, 0, target);
out:
@@ -1332,7 +1431,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
.startup.single = smpboot_unpark_threads,
- .teardown.single = NULL,
+ .teardown.single = smpboot_park_threads,
},
[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
.name = "irq/affinity:online",
@@ -1911,10 +2010,172 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
NULL
};
+#ifdef CONFIG_HOTPLUG_SMT
+
+static const char *smt_states[] = {
+ [CPU_SMT_ENABLED] = "on",
+ [CPU_SMT_DISABLED] = "off",
+ [CPU_SMT_FORCE_DISABLED] = "forceoff",
+ [CPU_SMT_NOT_SUPPORTED] = "notsupported",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
+}
+
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+static int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ctrlval, ret;
+
+ if (sysfs_streq(buf, "on"))
+ ctrlval = CPU_SMT_ENABLED;
+ else if (sysfs_streq(buf, "off"))
+ ctrlval = CPU_SMT_DISABLED;
+ else if (sysfs_streq(buf, "forceoff"))
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ else
+ return -EINVAL;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+ return -EPERM;
+
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return -ENODEV;
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ if (ctrlval != cpu_smt_control) {
+ switch (ctrlval) {
+ case CPU_SMT_ENABLED:
+ ret = cpuhp_smt_enable();
+ break;
+ case CPU_SMT_DISABLED:
+ case CPU_SMT_FORCE_DISABLED:
+ ret = cpuhp_smt_disable(ctrlval);
+ break;
+ }
+ }
+
+ unlock_device_hotplug();
+ return ret ? ret : count;
+}
+static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+
+static ssize_t
+show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bool active = topology_max_smt_threads() > 1;
+
+ return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+}
+static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+
+static struct attribute *cpuhp_smt_attrs[] = {
+ &dev_attr_control.attr,
+ &dev_attr_active.attr,
+ NULL
+};
+
+static const struct attribute_group cpuhp_smt_attr_group = {
+ .attrs = cpuhp_smt_attrs,
+ .name = "smt",
+ NULL
+};
+
+static int __init cpu_smt_state_init(void)
+{
+ return sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpuhp_smt_attr_group);
+}
+
+#else
+static inline int cpu_smt_state_init(void) { return 0; }
+#endif
+
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
+ ret = cpu_smt_state_init();
+ if (ret)
+ return ret;
+
ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_cpu_root_attr_group);
if (ret)
@@ -2017,5 +2278,8 @@ void __init boot_cpu_init(void)
*/
void __init boot_cpu_hotplug_init(void)
{
- per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
+#ifdef CONFIG_SMP
+ this_cpu_write(cpuhp_state.booted_once, true);
+#endif
+ this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..933cb3e45b98 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
#include <asm/sections.h>
/* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
@@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void)
if (vmcoreinfo_data_safecopy)
vmcoreinfo_data = vmcoreinfo_data_safecopy;
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
update_vmcoreinfo_note();
}
@@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
+ VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d987dcd1bd56..286d82329eb0 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* @dev: Pointer to device for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP flags to use for this allocation.
+ * @no_warn: Avoid printing message about failed allocation.
*
* This function allocates memory buffer for specified device. It uses
* device specific contiguous memory area if available or the default
@@ -186,12 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* function.
*/
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int align, gfp_t gfp_mask)
+ unsigned int align, bool no_warn)
{
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
- return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask);
+ return cma_alloc(dev_get_cma_area(dev), count, align, no_warn);
}
/**
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8be8106270c2..1c35b7b945d0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
- page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, page_order,
+ gfp & __GFP_NOWARN);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_release_from_contiguous(dev, page, count);
page = NULL;
@@ -180,10 +181,10 @@ int dma_direct_supported(struct device *dev, u64 mask)
return 0;
#endif
/*
- * Various PCI/PCIe bridges have broken support for > 32bit DMA even
- * if the device itself might support it.
+ * Upstream PCI/PCIe bridges or SoC interconnects may not carry
+ * as many DMA address bits as the device itself supports.
*/
- if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+ if (dev->bus_dma_mask && mask > dev->bus_dma_mask)
return 0;
return 1;
}
diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c
index 79e9a757387f..031fe235d958 100644
--- a/kernel/dma/noncoherent.c
+++ b/kernel/dma/noncoherent.c
@@ -49,11 +49,13 @@ static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
return nents;
}
-#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+ arch_sync_dma_for_cpu_all(dev);
}
static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
@@ -64,6 +66,7 @@ static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
for_each_sg(sgl, sg, nents, i)
arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+ arch_sync_dma_for_cpu_all(dev);
}
static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
@@ -89,7 +92,8 @@ const struct dma_map_ops dma_noncoherent_ops = {
.sync_sg_for_device = dma_noncoherent_sync_sg_for_device,
.map_page = dma_noncoherent_map_page,
.map_sg = dma_noncoherent_map_sg,
-#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
.sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu,
.sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu,
.unmap_page = dma_noncoherent_unmap_page,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 904541055792..4f8a6dbf0b60 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -17,6 +17,8 @@
* 08/12/11 beckyb Add highmem support
*/
+#define pr_fmt(fmt) "software IO TLB: " fmt
+
#include <linux/cache.h>
#include <linux/dma-direct.h>
#include <linux/mm.h>
@@ -162,20 +164,16 @@ static bool no_iotlb_memory;
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
- unsigned char *vstart, *vend;
if (no_iotlb_memory) {
- pr_warn("software IO TLB: No low mem\n");
+ pr_warn("No low mem\n");
return;
}
- vstart = phys_to_virt(io_tlb_start);
- vend = phys_to_virt(io_tlb_end);
-
- printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n",
+ pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n",
(unsigned long long)io_tlb_start,
(unsigned long long)io_tlb_end,
- bytes >> 20, vstart, vend - 1);
+ bytes >> 20);
}
/*
@@ -275,7 +273,7 @@ swiotlb_init(int verbose)
if (io_tlb_start)
memblock_free_early(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
- pr_warn("Cannot allocate SWIOTLB buffer");
+ pr_warn("Cannot allocate buffer");
no_iotlb_memory = true;
}
@@ -317,8 +315,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
return -ENOMEM;
}
if (order != get_order(bytes)) {
- printk(KERN_WARNING "Warning: only able to allocate %ld MB "
- "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+ pr_warn("only able to allocate %ld MB\n",
+ (PAGE_SIZE << order) >> 20);
io_tlb_nslabs = SLABS_PER_PAGE << order;
}
rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c187aa3df3c8..24a77c34e9ad 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -4,7 +4,7 @@
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
- * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f6ea33a9f904..2a62b96600ad 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+ return perf_event_pid_type(event, p, PIDTYPE_TGID);
}
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
@@ -5246,8 +5246,8 @@ void perf_event_update_userpage(struct perf_event *event)
userpg = rb->user_page;
/*
- * Disable preemption so as to not let the corresponding user-space
- * spin too long if we get preempted.
+ * Disable preemption to guarantee consistent time stamps are stored to
+ * the user page.
*/
preempt_disable();
++userpg->lock;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index aed1ba569954..3207a4d26849 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -299,8 +299,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* Called with mm->mmap_sem held for write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
- uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr, uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
struct vm_area_struct *vma;
@@ -351,7 +351,7 @@ put_old:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -366,7 +366,8 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+ return uprobe_write_opcode(auprobe, mm, vaddr,
+ *(uprobe_opcode_t *)&auprobe->insn);
}
static struct uprobe *get_uprobe(struct uprobe *uprobe)
@@ -840,13 +841,8 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
- consumer_add(uprobe, uc);
- return register_for_each_vma(uprobe, uc);
-}
-
-static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void
+__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
@@ -860,24 +856,46 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
}
/*
- * uprobe_register - register a probe
+ * uprobe_unregister - unregister an already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return;
+
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
+
+/*
+ * __uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
* @uc: information on howto handle the probe..
*
- * Apart from the access refcount, uprobe_register() takes a creation
+ * Apart from the access refcount, __uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of uprobe_register() is required to keep @inode
+ * unregisters. Caller of __uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
*/
-int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+static int __uprobe_register(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
@@ -904,7 +922,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
- ret = __uprobe_register(uprobe, uc);
+ consumer_add(uprobe, uc);
+ ret = register_for_each_vma(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
}
@@ -915,6 +934,12 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
goto retry;
return ret;
}
+
+int uprobe_register(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc)
+{
+ return __uprobe_register(inode, offset, uc);
+}
EXPORT_SYMBOL_GPL(uprobe_register);
/*
@@ -946,27 +971,6 @@ int uprobe_apply(struct inode *inode, loff_t offset,
return ret;
}
-/*
- * uprobe_unregister - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
- * @uc: identify which probe if multiple probes are colocated.
- */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- struct uprobe *uprobe;
-
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return;
-
- down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
-}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
-
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
struct vm_area_struct *vma;
diff --git a/kernel/exit.c b/kernel/exit.c
index c3c7ac560114..0e21e6d21f35 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
nr_threads--;
detach_pid(p, PIDTYPE_PID);
if (group_dead) {
+ detach_pid(p, PIDTYPE_TGID);
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
@@ -680,7 +681,8 @@ static void forget_original_parent(struct task_struct *father,
t->parent = t->real_parent;
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
- SEND_SIG_NOINFO, t);
+ SEND_SIG_NOINFO, t,
+ PIDTYPE_TGID);
}
/*
* If this is a threaded reparent there is no need to
@@ -1001,14 +1003,6 @@ struct wait_opts {
int notask_error;
};
-static inline
-struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
-{
- if (type != PIDTYPE_PID)
- task = task->group_leader;
- return task->pids[type].pid;
-}
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d8d0e016fc6..d896e9ca38b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep;
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
- struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ struct vm_area_struct *vma;
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (vma)
vma_init(vma, mm);
return vma;
@@ -866,6 +867,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->fail_nth = 0;
#endif
+#ifdef CONFIG_BLK_CGROUP
+ tsk->throttle_queue = NULL;
+ tsk->use_memdelay = 0;
+#endif
+
+#ifdef CONFIG_MEMCG
+ tsk->active_memcg = NULL;
+#endif
return tsk;
free_stack:
@@ -1293,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+ tsk->last_switch_time = 0;
#endif
tsk->mm = NULL;
@@ -1417,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
atomic_set(&sig->count, 1);
+ spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ spin_unlock_irq(&current->sighand->siglock);
return 0;
}
@@ -1479,6 +1491,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
+ INIT_HLIST_HEAD(&sig->multiprocess);
seqlock_init(&sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
@@ -1572,10 +1585,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
#endif
+static inline void init_task_pid_links(struct task_struct *task)
+{
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&task->pid_links[type]);
+ }
+}
+
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
- task->pids[type].pid = pid;
+ if (type == PIDTYPE_PID)
+ task->thread_pid = pid;
+ else
+ task->signal->pids[type] = pid;
}
static inline void rcu_copy_process(struct task_struct *p)
@@ -1613,6 +1638,7 @@ static __latent_entropy struct task_struct *copy_process(
{
int retval;
struct task_struct *p;
+ struct multiprocess_signals delayed;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1660,6 +1686,24 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ /*
+ * Force any signals received before this point to be delivered
+ * before the fork happens. Collect up signals sent to multiple
+ * processes that happen during the fork and delay them so that
+ * they appear to happen after the fork.
+ */
+ sigemptyset(&delayed.signal);
+ INIT_HLIST_NODE(&delayed.node);
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (!(clone_flags & CLONE_THREAD))
+ hlist_add_head(&delayed.node, &current->signal->multiprocess);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ retval = -ERESTARTNOINTR;
+ if (signal_pending(current))
+ goto fork_out;
+
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
@@ -1933,29 +1977,26 @@ static __latent_entropy struct task_struct *copy_process(
rseq_fork(p, clone_flags);
- /*
- * Process group and session signals need to be delivered to just the
- * parent before the fork or both the parent and the child after the
- * fork. Restart if a signal comes in before we add the new process to
- * it's process group.
- * A fatal signal pending means that current will exit, so the new
- * thread can't slip out of an OOM kill (or normal SIGKILL).
- */
- recalc_sigpending();
- if (signal_pending(current)) {
- retval = -ERESTARTNOINTR;
- goto bad_fork_cancel_cgroup;
- }
+ /* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
+ /* Let kill terminate clone/fork in the middle */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto bad_fork_cancel_cgroup;
+ }
+
+
+ init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
+ init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
@@ -1963,8 +2004,7 @@ static __latent_entropy struct task_struct *copy_process(
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
-
- p->signal->leader_pid = pid;
+ p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
@@ -1975,6 +2015,7 @@ static __latent_entropy struct task_struct *copy_process(
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
@@ -1982,6 +2023,7 @@ static __latent_entropy struct task_struct *copy_process(
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
atomic_inc(&current->signal->sigcnt);
+ task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
@@ -1990,8 +2032,8 @@ static __latent_entropy struct task_struct *copy_process(
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
-
total_forks++;
+ hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
@@ -2056,16 +2098,19 @@ bad_fork_free:
put_task_stack(p);
free_task(p);
fork_out:
+ spin_lock_irq(&current->sighand->siglock);
+ hlist_del_init(&delayed.node);
+ spin_unlock_irq(&current->sighand->siglock);
return ERR_PTR(retval);
}
-static inline void init_idle_pids(struct pid_link *links)
+static inline void init_idle_pids(struct task_struct *idle)
{
enum pid_type type;
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
- INIT_HLIST_NODE(&links[type].node); /* not really needed */
- links[type].pid = &init_struct_pid;
+ INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
+ init_task_pid(idle, type, &init_struct_pid);
}
}
@@ -2075,7 +2120,7 @@ struct task_struct *fork_idle(int cpu)
task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
cpu_to_node(cpu));
if (!IS_ERR(task)) {
- init_idle_pids(task->pids);
+ init_idle_pids(task);
init_idle(task, cpu);
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 6f56a9e219fa..b162b74611e4 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -15,7 +15,9 @@
atomic_t system_freezing_cnt = ATOMIC_INIT(0);
EXPORT_SYMBOL(system_freezing_cnt);
-/* indicate whether PM freezing is in effect, protected by pm_mutex */
+/* indicate whether PM freezing is in effect, protected by
+ * system_transition_mutex
+ */
bool pm_freezing;
bool pm_nosig_freezing;
diff --git a/kernel/futex.c b/kernel/futex.c
index 1f450e092c74..11fc3bb456d6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3523,10 +3523,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
+ /* fall through */
case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
+ /* fall through */
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 32b479468e4d..b9132d1269ef 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
+/*
+ * Zero (default value) means use sysctl_hung_task_timeout_secs:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+
int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
@@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
return;
}
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return;
trace_sched_process_hang(t);
@@ -245,8 +253,13 @@ static int watchdog(void *dummy)
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
- long t = hung_timeout_jiffies(hung_last_checked, timeout);
+ unsigned long interval = sysctl_hung_task_check_interval_secs;
+ long t;
+ if (interval == 0)
+ interval = timeout;
+ interval = min_t(unsigned long, interval, timeout);
+ t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
if (!atomic_xchg(&reset_hung_task, 0))
check_hung_uninterruptible_tasks(timeout);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aed8fb2564b3..68559808fdfa 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -11,6 +11,7 @@
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
+#include <linux/security.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
@@ -195,10 +196,17 @@ out:
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
+ int result;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
+ /* Permit LSMs and IMA to fail the kexec */
+ result = security_kernel_load_data(LOADING_KEXEC_IMAGE);
+ if (result < 0)
+ return result;
+
/*
* Verify we have a legal set of flags
* This leaves us room for future extensions.
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3a4656fb7047..5b77a7314e01 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -678,6 +678,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
if (!func->old_name || !func->new_func)
return -EINVAL;
+ if (strlen(func->old_name) >= KSYM_NAME_LEN)
+ return -EINVAL;
+
INIT_LIST_HEAD(&func->stack_node);
func->patched = false;
func->transition = false;
@@ -751,6 +754,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
if (!obj->funcs)
return -EINVAL;
+ if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN)
+ return -EINVAL;
+
obj->patched = false;
obj->mod = NULL;
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 7c6631e693bc..5bc349805e03 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -310,13 +310,6 @@ static bool klp_try_switch_task(struct task_struct *task)
return true;
/*
- * For arches which don't have reliable stack traces, we have to rely
- * on other methods (e.g., switching tasks at kernel exit).
- */
- if (!klp_have_reliable_stack())
- return false;
-
- /*
* Now try to check the stack for any to-be-patched or to-be-unpatched
* functions. If all goes well, switch the task to the target patch
* state.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa4d3138bf1..e406c5fdb41e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
#include "lockdep_internals.h"
+#include <trace/events/preemptirq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -248,12 +249,7 @@ void clear_lock_stats(struct lock_class *class)
static struct lock_class_stats *get_lock_stats(struct lock_class *class)
{
- return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
-}
-
-static void put_lock_stats(struct lock_class_stats *stats)
-{
- put_cpu_var(cpu_lock_stats);
+ return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes];
}
static void lock_release_holdtime(struct held_lock *hlock)
@@ -271,7 +267,6 @@ static void lock_release_holdtime(struct held_lock *hlock)
lock_time_inc(&stats->read_holdtime, holdtime);
else
lock_time_inc(&stats->write_holdtime, holdtime);
- put_lock_stats(stats);
}
#else
static inline void lock_release_holdtime(struct held_lock *hlock)
@@ -2845,10 +2840,8 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
debug_atomic_inc(hardirqs_on_events);
}
-__visible void trace_hardirqs_on_caller(unsigned long ip)
+void lockdep_hardirqs_on(unsigned long ip)
{
- time_hardirqs_on(CALLER_ADDR0, ip);
-
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2887,23 +2880,14 @@ __visible void trace_hardirqs_on_caller(unsigned long ip)
__trace_hardirqs_on_caller(ip);
current->lockdep_recursion = 0;
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-
-void trace_hardirqs_on(void)
-{
- trace_hardirqs_on_caller(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-__visible void trace_hardirqs_off_caller(unsigned long ip)
+void lockdep_hardirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
- time_hardirqs_off(CALLER_ADDR0, ip);
-
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2925,13 +2909,6 @@ __visible void trace_hardirqs_off_caller(unsigned long ip)
} else
debug_atomic_inc(redundant_hardirqs_off);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-
-void trace_hardirqs_off(void)
-{
- trace_hardirqs_off_caller(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -4090,7 +4067,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
stats->contending_point[contending_point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
- put_lock_stats(stats);
}
static void
@@ -4138,7 +4114,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
}
if (lock->cpu != cpu)
stats->bounces[bounce_acquired + !!hlock->read]++;
- put_lock_stats(stats);
lock->cpu = cpu;
lock->ip = ip;
@@ -4338,7 +4313,7 @@ out_restore:
raw_local_irq_restore(flags);
}
-void __init lockdep_info(void)
+void __init lockdep_init(void)
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 57bef4fbfb31..7d0b0ed74404 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -368,7 +368,7 @@ static struct lock_torture_ops mutex_lock_ops = {
};
#include <linux/ww_mutex.h>
-static DEFINE_WW_CLASS(torture_ww_class);
+static DEFINE_WD_CLASS(torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index f44f658ae629..1a81a1257b3f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -174,6 +174,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait
}
/*
+ * Add @waiter to a given location in the lock wait_list and set the
+ * FLAG_WAITERS flag if it's the first waiter.
+ */
+static void __sched
+__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct list_head *list)
+{
+ debug_mutex_add_waiter(lock, waiter, current);
+
+ list_add_tail(&waiter->list, list);
+ if (__mutex_waiter_is_first(lock, waiter))
+ __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+}
+
+/*
* Give up ownership to a specific task, when @task = NULL, this is equivalent
* to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
* WAITERS. Provides RELEASE semantics like a regular unlock, the
@@ -244,6 +259,22 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
static __always_inline void
ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
@@ -282,26 +313,108 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
#endif
ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
}
+/*
+ * Determine if context @a is 'after' context @b. IOW, @a is a younger
+ * transaction than @b and depending on algorithm either needs to wait for
+ * @b or die.
+ */
static inline bool __sched
__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
{
- return a->stamp - b->stamp <= LONG_MAX &&
- (a->stamp != b->stamp || a > b);
+
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a younger waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool __sched
+__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 &&
+ __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) {
+ debug_mutex_wake_waiter(lock, waiter);
+ wake_up_process(waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a younger @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with older transactions than
+ * the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx)
+{
+ struct task_struct *owner = __mutex_owner(lock);
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_lock_check_stamp() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current)
+ wake_up_process(owner);
+
+ return true;
+ }
+
+ return false;
}
/*
- * Wake up any waiters that may have to back off when the lock is held by the
- * given context.
+ * We just acquired @lock under @ww_ctx, if there are later contexts waiting
+ * behind us on the wait-list, check if they need to die, or wound us.
*
- * Due to the invariants on the wait list, this can only affect the first
- * waiter with a context.
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
*
* The current task must not be on the wait list.
*/
static void __sched
-__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
{
struct mutex_waiter *cur;
@@ -311,66 +424,51 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
if (!cur->ww_ctx)
continue;
- if (cur->ww_ctx->acquired > 0 &&
- __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
-
- break;
+ if (__ww_mutex_die(lock, cur, ww_ctx) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx))
+ break;
}
}
/*
- * After acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
*/
static __always_inline void
ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
ww_mutex_lock_acquired(lock, ctx);
- lock->ctx = ctx;
-
/*
* The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
+ * the WAITERS check is done, otherwise contended waiters might be
* missed. The contended waiters will either see ww_ctx == NULL
* and keep spinning, or it will acquire wait_lock, add itself
* to waiter list and sleep.
*/
- smp_mb(); /* ^^^ */
+ smp_mb(); /* See comments above and below. */
/*
- * Check if lock is contended, if not there is nobody to wake up
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
*/
if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS)))
return;
/*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
*/
spin_lock(&lock->base.wait_lock);
- __ww_mutex_wakeup_for_backoff(&lock->base, ctx);
+ __ww_mutex_check_waiters(&lock->base, ctx);
spin_unlock(&lock->base.wait_lock);
}
-/*
- * After acquiring lock in the slowpath set ctx.
- *
- * Unlike for the fast path, the caller ensures that waiters are woken up where
- * necessary.
- *
- * Callers must hold the mutex wait_lock.
- */
-static __always_inline void
-ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
- ww_mutex_lock_acquired(lock, ctx);
- lock->ctx = ctx;
-}
-
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline
@@ -646,37 +744,83 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
}
EXPORT_SYMBOL(ww_mutex_unlock);
+
+static __always_inline int __sched
+__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
static inline int __sched
-__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter,
- struct ww_acquire_ctx *ctx)
+__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter,
+ struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
struct mutex_waiter *cur;
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx))
- goto deadlock;
+ return __ww_mutex_kill(lock, ctx);
/*
* If there is a waiter in front of us that has a context, then its
- * stamp is earlier than ours and we must back off.
+ * stamp is earlier than ours and we must kill ourself.
*/
cur = waiter;
list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) {
- if (cur->ww_ctx)
- goto deadlock;
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
}
return 0;
-
-deadlock:
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
- ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
}
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
static inline int __sched
__ww_mutex_add_waiter(struct mutex_waiter *waiter,
struct mutex *lock,
@@ -684,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
{
struct mutex_waiter *cur;
struct list_head *pos;
+ bool is_wait_die;
if (!ww_ctx) {
- list_add_tail(&waiter->list, &lock->wait_list);
+ __mutex_add_waiter(lock, waiter, &lock->wait_list);
return 0;
}
+ is_wait_die = ww_ctx->is_wait_die;
+
/*
* Add the waiter before the first waiter with a higher stamp.
* Waiters without a context are skipped to avoid starving
- * them.
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
*/
pos = &lock->wait_list;
list_for_each_entry_reverse(cur, &lock->wait_list, list) {
@@ -701,16 +850,16 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
continue;
if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) {
- /* Back off immediately if necessary. */
- if (ww_ctx->acquired > 0) {
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
- ww_ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
}
break;
@@ -718,17 +867,28 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
pos = &cur->list;
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx);
+ }
+
+ __mutex_add_waiter(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
/*
- * Wake up the waiter so that it gets a chance to back
- * off.
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
*/
- if (cur->ww_ctx->acquired > 0) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx);
}
- list_add_tail(&waiter->list, pos);
return 0;
}
@@ -751,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
}
preempt_disable();
@@ -772,7 +940,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
if (__mutex_trylock(lock)) {
if (use_ww_ctx && ww_ctx)
- __ww_mutex_wakeup_for_backoff(lock, ww_ctx);
+ __ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
}
@@ -784,25 +952,26 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (!use_ww_ctx) {
/* add waiting tasks to the end of the waitqueue (FIFO): */
- list_add_tail(&waiter.list, &lock->wait_list);
+ __mutex_add_waiter(lock, &waiter, &lock->wait_list);
+
#ifdef CONFIG_DEBUG_MUTEXES
waiter.ww_ctx = MUTEX_POISON_WW_CTX;
#endif
} else {
- /* Add in stamp order, waking up waiters that must back off. */
+ /*
+ * Add in stamp order, waking up waiters that must kill
+ * themselves.
+ */
ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx);
if (ret)
- goto err_early_backoff;
+ goto err_early_kill;
waiter.ww_ctx = ww_ctx;
}
waiter.task = current;
- if (__mutex_waiter_is_first(lock, &waiter))
- __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
-
set_current_state(state);
for (;;) {
/*
@@ -815,7 +984,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto acquired;
/*
- * Check for signals and wound conditions while holding
+ * Check for signals and kill conditions while holding
* wait_lock. This ensures the lock cancellation is ordered
* against mutex_unlock() and wake-ups do not go missing.
*/
@@ -824,8 +993,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) {
- ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx);
+ if (use_ww_ctx && ww_ctx) {
+ ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
@@ -859,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);
+ if (use_ww_ctx && ww_ctx) {
+ /*
+ * Wound-Wait; we stole the lock (!first_waiter), check the
+ * waiters as anyone might want to wound us.
+ */
+ if (!ww_ctx->is_wait_die &&
+ !__mutex_waiter_is_first(lock, &waiter))
+ __ww_mutex_check_waiters(lock, ww_ctx);
+ }
+
mutex_remove_waiter(lock, &waiter, current);
if (likely(list_empty(&lock->wait_list)))
__mutex_clear_flag(lock, MUTEX_FLAGS);
@@ -870,7 +1049,7 @@ skip_wait:
lock_acquired(&lock->dep_map, ip);
if (use_ww_ctx && ww_ctx)
- ww_mutex_set_context_slowpath(ww, ww_ctx);
+ ww_mutex_lock_acquired(ww, ww_ctx);
spin_unlock(&lock->wait_lock);
preempt_enable();
@@ -879,7 +1058,7 @@ skip_wait:
err:
__set_current_state(TASK_RUNNING);
mutex_remove_waiter(lock, &waiter, current);
-err_early_backoff:
+err_early_kill:
spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 0e4cd64ad2c0..5b915b370d5a 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -26,7 +26,7 @@
#include <linux/slab.h>
#include <linux/ww_mutex.h>
-static DEFINE_WW_CLASS(ww_class);
+static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
struct test_mutex {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 38283363da06..5b8600d39931 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
@@ -42,7 +43,7 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff)
pgoff += 1UL << order, order = order_at((res), pgoff))
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-int device_private_entry_fault(struct vm_area_struct *vma,
+vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
swp_entry_t entry,
unsigned int flags,
@@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data)
mem_hotplug_begin();
arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
&pgmap->altmap : NULL);
+ kasan_remove_zero_shadow(__va(align_start), align_size);
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_pfn_remap;
mem_hotplug_begin();
+ error = kasan_add_zero_shadow(__va(align_start), align_size);
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
error = arch_add_memory(nid, align_start, align_size, altmap, false);
if (!error)
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
@@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
return __va(res->start);
err_add_memory:
+ kasan_remove_zero_shadow(__va(align_start), align_size);
+ err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
err_radix:
@@ -355,7 +365,6 @@ void __put_devmap_managed_page(struct page *page)
__ClearPageActive(page);
__ClearPageWaiters(page);
- page->mapping = NULL;
mem_cgroup_uncharge(page);
page->pgmap->page_free(page, page->pgmap->data);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 915e123a430f..79c9be2dbbe9 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -9,4 +9,27 @@
* 2 of the Licence, or (at your option) any later version.
*/
-extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
+#include <linux/elf.h>
+#include <asm/module.h>
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs;
+ struct _ddebug *debug;
+ unsigned int num_debug;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
+extern int mod_verify_sig(const void *mod, struct load_info *info);
diff --git a/kernel/module.c b/kernel/module.c
index f475f30eed8c..6746c85511fe 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -307,24 +307,6 @@ int unregister_module_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_module_notifier);
-struct load_info {
- const char *name;
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
-#ifdef CONFIG_KALLSYMS
- unsigned long mod_kallsyms_init_off;
-#endif
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
/*
* We require a truly strong try_module_get(): 0 means success.
* Otherwise an error is returned due to ongoing or failed
@@ -547,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms,
return true;
}
+static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return offset_to_ptr(&sym->name_offset);
+#else
+ return sym->name;
+#endif
+}
+
static int cmp_name(const void *va, const void *vb)
{
const char *a;
const struct kernel_symbol *b;
a = va; b = vb;
- return strcmp(a, b->name);
+ return strcmp(a, kernel_symbol_name(b));
}
static bool find_symbol_in_section(const struct symsearch *syms,
@@ -1339,14 +1339,12 @@ static inline int check_modstruct_version(const struct load_info *info,
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
- &crc, true, false)) {
+ if (!find_symbol("module_layout", NULL, &crc, true, false)) {
preempt_enable();
BUG();
}
preempt_enable();
- return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
- mod, crc);
+ return check_version(info, "module_layout", mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -2059,21 +2057,19 @@ static int copy_module_elf(struct module *mod, struct load_info *info)
/* Elf section header table */
size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
- mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL);
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
if (mod->klp_info->sechdrs == NULL) {
ret = -ENOMEM;
goto free_info;
}
- memcpy(mod->klp_info->sechdrs, info->sechdrs, size);
/* Elf section name string table */
size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
- mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL);
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
if (mod->klp_info->secstrings == NULL) {
ret = -ENOMEM;
goto free_sechdrs;
}
- memcpy(mod->klp_info->secstrings, info->secstrings, size);
/* Elf symbol section index */
symndx = info->index.sym;
@@ -2192,7 +2188,7 @@ void *__symbol_get(const char *symbol)
sym = NULL;
preempt_enable();
- return sym ? (void *)sym->value : NULL;
+ return sym ? (void *)kernel_symbol_value(sym) : NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -2222,10 +2218,12 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(s->name, &owner, NULL, true, false)) {
+ if (find_symbol(kernel_symbol_name(s), &owner, NULL,
+ true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
- mod->name, s->name, module_name(owner));
+ mod->name, kernel_symbol_name(s),
+ module_name(owner));
return -ENOEXEC;
}
}
@@ -2274,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
ksym = resolve_symbol_wait(mod, info, name);
/* Ok if resolved. */
if (ksym && !IS_ERR(ksym)) {
- sym[i].st_value = ksym->value;
+ sym[i].st_value = kernel_symbol_value(ksym);
break;
}
@@ -2282,9 +2280,9 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
- pr_warn("%s: Unknown symbol %s (err %li)\n",
- mod->name, name, PTR_ERR(ksym));
ret = PTR_ERR(ksym) ?: -ENOENT;
+ pr_warn("%s: Unknown symbol %s (err %d)\n",
+ mod->name, name, ret);
break;
default:
@@ -2486,7 +2484,11 @@ static char *get_modinfo(struct load_info *info, const char *tag)
Elf_Shdr *infosec = &info->sechdrs[info->index.info];
unsigned long size = infosec->sh_size;
- for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
+ /*
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
+ */
+ for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
@@ -2534,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value,
ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
else
ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
- return ks != NULL && ks->value == value;
+ return ks != NULL && kernel_symbol_value(ks) == value;
}
/* As per nm */
@@ -2774,7 +2776,7 @@ static int module_sig_check(struct load_info *info, int flags)
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
- err = mod_verify_sig(mod, &info->len);
+ err = mod_verify_sig(mod, info);
}
if (!err) {
@@ -2876,7 +2878,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
- err = security_kernel_read_file(NULL, READING_MODULE);
+ err = security_kernel_load_data(LOADING_MODULE);
if (err)
return err;
@@ -2926,17 +2928,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
}
/* Track but don't keep modinfo and version sections. */
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- info->index.info = find_sec(info, ".modinfo");
- if (!info->index.info)
- info->name = "(missing .modinfo section)";
- else
- info->name = get_modinfo(info, "name");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
return 0;
@@ -2947,23 +2939,24 @@ static int rewrite_section_headers(struct load_info *info, int flags)
* search for module section index etc), and do some basic section
* verification.
*
- * Return the temporary module pointer (we'll replace it with the final
- * one when we move the module sections around).
+ * Set info->mod to the temporary copy of the module in info->hdr. The final one
+ * will be allocated in move_module().
*/
-static struct module *setup_load_info(struct load_info *info, int flags)
+static int setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
- int err;
- struct module *mod;
/* Set up the convenience variables */
info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
info->secstrings = (void *)info->hdr
+ info->sechdrs[info->hdr->e_shstrndx].sh_offset;
- err = rewrite_section_headers(info, flags);
- if (err)
- return ERR_PTR(err);
+ /* Try to find a name early so we can log errors with a module name */
+ info->index.info = find_sec(info, ".modinfo");
+ if (!info->index.info)
+ info->name = "(missing .modinfo section)";
+ else
+ info->name = get_modinfo(info, "name");
/* Find internal symbols and strings. */
for (i = 1; i < info->hdr->e_shnum; i++) {
@@ -2976,34 +2969,35 @@ static struct module *setup_load_info(struct load_info *info, int flags)
}
}
+ if (info->index.sym == 0) {
+ pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+ return -ENOEXEC;
+ }
+
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
pr_warn("%s: No module found in object\n",
info->name ?: "(missing .modinfo name field)");
- return ERR_PTR(-ENOEXEC);
+ return -ENOEXEC;
}
/* This is temporary: point mod into copy of data. */
- mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
/*
- * If we didn't load the .modinfo 'name' field, fall back to
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
* on-disk struct mod 'name' field.
*/
if (!info->name)
- info->name = mod->name;
+ info->name = info->mod->name;
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", info->name);
- return ERR_PTR(-ENOEXEC);
- }
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
info->index.pcpu = find_pcpusec(info);
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info, mod))
- return ERR_PTR(-ENOEXEC);
-
- return mod;
+ return 0;
}
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
@@ -3298,25 +3292,17 @@ core_param(module_blacklist, module_blacklist, charp, 0400);
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
- /* Module within temporary copy. */
struct module *mod;
unsigned int ndx;
int err;
- mod = setup_load_info(info, flags);
- if (IS_ERR(mod))
- return mod;
-
- if (blacklisted(info->name))
- return ERR_PTR(-EPERM);
-
- err = check_modinfo(mod, info, flags);
+ err = check_modinfo(info->mod, info, flags);
if (err)
return ERR_PTR(err);
/* Allow arches to frob section contents and sizes. */
err = module_frob_arch_sections(info->hdr, info->sechdrs,
- info->secstrings, mod);
+ info->secstrings, info->mod);
if (err < 0)
return ERR_PTR(err);
@@ -3335,11 +3321,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
- layout_sections(mod, info);
- layout_symtab(mod, info);
+ layout_sections(info->mod, info);
+ layout_symtab(info->mod, info);
/* Allocate and move to the final place */
- err = move_module(mod, info);
+ err = move_module(info->mod, info);
if (err)
return ERR_PTR(err);
@@ -3657,17 +3643,36 @@ static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
- long err;
+ long err = 0;
char *after_dashes;
+ err = elf_header_check(info);
+ if (err)
+ goto free_copy;
+
+ err = setup_load_info(info, flags);
+ if (err)
+ goto free_copy;
+
+ if (blacklisted(info->name)) {
+ err = -EPERM;
+ goto free_copy;
+ }
+
err = module_sig_check(info, flags);
if (err)
goto free_copy;
- err = elf_header_check(info);
+ err = rewrite_section_headers(info, flags);
if (err)
goto free_copy;
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod)) {
+ err = -ENOEXEC;
+ goto free_copy;
+ }
+
/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
@@ -4067,7 +4072,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name)
for (i = 0; i < kallsyms->num_symtab; i++)
if (strcmp(name, symname(kallsyms, i)) == 0 &&
- kallsyms->symtab[i].st_info != 'U')
+ kallsyms->symtab[i].st_shndx != SHN_UNDEF)
return kallsyms->symtab[i].st_value;
return 0;
}
@@ -4113,6 +4118,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (mod->state == MODULE_STATE_UNFORMED)
continue;
for (i = 0; i < kallsyms->num_symtab; i++) {
+
+ if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+ continue;
+
ret = fn(data, symname(kallsyms, i),
mod, kallsyms->symtab[i].st_value);
if (ret != 0)
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 937c844bee4a..f2075ce8e4b3 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -45,10 +45,10 @@ struct module_signature {
/*
* Verify the signature on a module.
*/
-int mod_verify_sig(const void *mod, unsigned long *_modlen)
+int mod_verify_sig(const void *mod, struct load_info *info)
{
struct module_signature ms;
- size_t modlen = *_modlen, sig_len;
+ size_t sig_len, modlen = info->len;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -62,10 +62,11 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
if (sig_len >= modlen)
return -EBADMSG;
modlen -= sig_len;
- *_modlen = modlen;
+ info->len = modlen;
if (ms.id_type != PKEY_ID_PKCS7) {
- pr_err("Module is not signed with expected PKCS#7 message\n");
+ pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+ info->name);
return -ENOPKG;
}
@@ -76,7 +77,8 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
ms.__pad[0] != 0 ||
ms.__pad[1] != 0 ||
ms.__pad[2] != 0) {
- pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+ pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+ info->name);
return -EBADMSG;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 157fe4b19971..de1cfc4f75a2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,27 +265,33 @@ struct pid *find_vpid(int nr)
}
EXPORT_SYMBOL_GPL(find_vpid);
+static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
+{
+ return (type == PIDTYPE_PID) ?
+ &task->thread_pid :
+ &task->signal->pids[type];
+}
+
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid_link *link = &task->pids[type];
- hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
+ struct pid *pid = *task_pid_ptr(task, type);
+ hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
static void __change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
{
- struct pid_link *link;
+ struct pid **pid_ptr = task_pid_ptr(task, type);
struct pid *pid;
int tmp;
- link = &task->pids[type];
- pid = link->pid;
+ pid = *pid_ptr;
- hlist_del_rcu(&link->node);
- link->pid = new;
+ hlist_del_rcu(&task->pid_links[type]);
+ *pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (!hlist_empty(&pid->tasks[tmp]))
@@ -310,8 +316,9 @@ void change_pid(struct task_struct *task, enum pid_type type,
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- new->pids[type].pid = old->pids[type].pid;
- hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+ if (type == PIDTYPE_PID)
+ new->thread_pid = old->thread_pid;
+ hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -322,7 +329,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
lockdep_tasklist_lock_is_held());
if (first)
- result = hlist_entry(first, struct task_struct, pids[(type)].node);
+ result = hlist_entry(first, struct task_struct, pid_links[(type)]);
}
return result;
}
@@ -360,9 +367,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
- if (type != PIDTYPE_PID)
- task = task->group_leader;
- pid = get_pid(rcu_dereference(task->pids[type].pid));
+ pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
rcu_read_unlock();
return pid;
}
@@ -420,15 +425,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID) {
- if (type == __PIDTYPE_TGID)
- type = PIDTYPE_PID;
-
- task = task->group_leader;
- }
- nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
- }
+ if (likely(pid_alive(task)))
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e880ca22c5a5..3a6c2f87699e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -105,6 +105,7 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
+ select SRCU
config PM_SLEEP_SMP
def_bool y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 9c85c7822383..abef759de7c8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -338,7 +338,7 @@ static int create_image(int platform_mode)
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
*
- * This routine must be called with pm_mutex held.
+ * This routine must be called with system_transition_mutex held.
*/
int hibernation_snapshot(int platform_mode)
{
@@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode)
* hibernation_restore - Quiesce devices and restore from a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
*
- * This routine must be called with pm_mutex held. If it is successful, control
- * reappears in the restored target kernel in hibernation_snapshot().
+ * This routine must be called with system_transition_mutex held. If it is
+ * successful, control reappears in the restored target kernel in
+ * hibernation_snapshot().
*/
int hibernation_restore(int platform_mode)
{
@@ -638,6 +639,7 @@ static void power_down(void)
break;
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
+ /* Fall through */
case HIBERNATION_SHUTDOWN:
if (pm_power_off)
kernel_power_off();
@@ -805,13 +807,13 @@ static int software_resume(void)
* name_to_dev_t() below takes a sysfs buffer mutex when sysfs
* is configured into the kernel. Since the regular hibernate
* trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take pm_mutex) this can
- * cause lockdep to complain about a possible ABBA deadlock
+ * calling hibernate functions (which take system_transition_mutex)
+ * this can cause lockdep to complain about a possible ABBA deadlock
* which cannot happen since we're in the boot code here and
* sysfs can't be invoked yet. Therefore, we use a subclass
* here to avoid lockdep complaining.
*/
- mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
if (swsusp_resume_device)
goto Check_image;
@@ -899,7 +901,7 @@ static int software_resume(void)
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d9706da10930..35b50823d83b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,17 +15,16 @@
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include "power.h"
-DEFINE_MUTEX(pm_mutex);
-
#ifdef CONFIG_PM_SLEEP
void lock_system_sleep(void)
{
current->flags |= PF_FREEZER_SKIP;
- mutex_lock(&pm_mutex);
+ mutex_lock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(lock_system_sleep);
@@ -37,8 +36,9 @@ void unlock_system_sleep(void)
*
* Reason:
* Fundamentally, we just don't need it, because freezing condition
- * doesn't come into effect until we release the pm_mutex lock,
- * since the freezer always works with pm_mutex held.
+ * doesn't come into effect until we release the
+ * system_transition_mutex lock, since the freezer always works with
+ * system_transition_mutex held.
*
* More importantly, in the case of hibernation,
* unlock_system_sleep() gets called in snapshot_read() and
@@ -47,7 +47,7 @@ void unlock_system_sleep(void)
* enter the refrigerator, thus causing hibernation to lockup.
*/
current->flags &= ~PF_FREEZER_SKIP;
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 70178f6ffdc4..5342f6fc022e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state)
} else if (!valid_state(state)) {
return -EINVAL;
}
- if (!mutex_trylock(&pm_mutex))
+ if (!mutex_trylock(&system_transition_mutex))
return -EBUSY;
if (state == PM_SUSPEND_TO_IDLE)
@@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state)
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index c2bcf97d24c8..d7f6c1a288d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -923,7 +923,7 @@ int swsusp_write(unsigned int flags)
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot);
- if (error < PAGE_SIZE) {
+ if (error < (int)PAGE_SIZE) {
if (error >= 0)
error = -EFAULT;
@@ -1483,7 +1483,7 @@ int swsusp_read(unsigned int *flags_p)
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_write_next(&snapshot);
- if (error < PAGE_SIZE)
+ if (error < (int)PAGE_SIZE)
return error < 0 ? error : -EFAULT;
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, flags_p);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index abd225550271..2d8b60a3c86b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!mutex_trylock(&pm_mutex))
+ if (!mutex_trylock(&system_transition_mutex))
return -EBUSY;
lock_device_hotplug();
@@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
unlock_device_hotplug();
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a7d04049af4..0f1898820cba 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -19,11 +19,16 @@
#ifdef CONFIG_PRINTK
#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
-#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;
+__printf(5, 0)
+int vprintk_store(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args);
+
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
@@ -54,6 +59,8 @@ void __printk_safe_exit(void);
local_irq_enable(); \
} while (0)
+void defer_console_output(void);
+
#else
__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 247808333ba4..924e37fb1620 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -66,6 +66,9 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
+atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ignore_console_lock_warning);
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -349,7 +352,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
*/
enum log_flags {
- LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NOCONS = 1, /* suppress print, do not print to console */
LOG_NEWLINE = 2, /* text ended with a newline */
LOG_PREFIX = 4, /* text started with a prefix */
LOG_CONT = 8, /* text is a fragment of a continuation line */
@@ -1352,71 +1355,68 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
{
char *text;
int len = 0;
+ u64 next_seq;
+ u64 seq;
+ u32 idx;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
logbuf_lock_irq();
- if (buf) {
- u64 next_seq;
- u64 seq;
- u32 idx;
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = clear_seq;
+ idx = clear_idx;
+ while (seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
- /*
- * Find first record that fits, including all following records,
- * into the user-provided buffer for this dump.
- */
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ len += msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /* move first record forward until length fits into the buffer */
+ seq = clear_seq;
+ idx = clear_idx;
+ while (len > size && seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
- len -= msg_print_text(msg, true, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ len -= msg_print_text(msg, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ }
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ /* last message fitting into this dump */
+ next_seq = log_next_seq;
- len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen;
+ len = 0;
+ while (len >= 0 && seq < next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+ int textlen;
- textlen = msg_print_text(msg, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
- if (textlen < 0) {
- len = textlen;
- break;
- }
- idx = log_next(idx);
- seq++;
+ textlen = msg_print_text(msg, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+ idx = log_next(idx);
+ seq++;
- logbuf_unlock_irq();
- if (copy_to_user(buf + len, text, textlen))
- len = -EFAULT;
- else
- len += textlen;
- logbuf_lock_irq();
-
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
+ logbuf_unlock_irq();
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
+ logbuf_lock_irq();
+
+ if (seq < log_first_seq) {
+ /* messages are gone, move to next one */
+ seq = log_first_seq;
+ idx = log_first_idx;
}
}
@@ -1430,6 +1430,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return len;
}
+static void syslog_clear(void)
+{
+ logbuf_lock_irq();
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ logbuf_unlock_irq();
+}
+
int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
@@ -1474,7 +1482,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- syslog_print_all(NULL, 0, true);
+ syslog_clear();
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1824,28 +1832,16 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
}
-asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
+/* Must be called under logbuf_lock. */
+int vprintk_store(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
{
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len;
enum log_flags lflags = 0;
- unsigned long flags;
- int printed_len;
- bool in_sched = false;
-
- if (level == LOGLEVEL_SCHED) {
- level = LOGLEVEL_DEFAULT;
- in_sched = true;
- }
- boot_delay_msec(level);
- printk_delay();
-
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1886,8 +1882,32 @@ asmlinkage int vprintk_emit(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len);
+ if (suppress_message_printing(level))
+ lflags |= LOG_NOCONS;
+ return log_output(facility, level, lflags,
+ dict, dictlen, text, text_len);
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
+{
+ int printed_len;
+ bool in_sched = false;
+ unsigned long flags;
+
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
+ in_sched = true;
+ }
+
+ boot_delay_msec(level);
+ printk_delay();
+
+ /* This stops the holder of console_sem just where we want him */
+ logbuf_lock_irqsave(flags);
+ printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
@@ -2013,7 +2033,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2243,6 +2262,7 @@ int is_console_locked(void)
{
return console_locked;
}
+EXPORT_SYMBOL(is_console_locked);
/*
* Check if we have any console that is capable of printing while cpu is
@@ -2349,11 +2369,10 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
+ if (msg->flags & LOG_NOCONS) {
/*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
+ * Skip record if !ignore_loglevel, and
+ * record has level above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
@@ -2772,7 +2791,8 @@ EXPORT_SYMBOL(unregister_console);
void __init console_init(void)
{
int ret;
- initcall_t *call;
+ initcall_t call;
+ initcall_entry_t *ce;
/* Setup the default TTY line discipline. */
n_tty_init();
@@ -2781,13 +2801,14 @@ void __init console_init(void)
* set up the console device so that later boot sequences can
* inform about problems etc..
*/
- call = __con_initcall_start;
+ ce = __con_initcall_start;
trace_initcall_level("console");
- while (call < __con_initcall_end) {
- trace_initcall_start((*call));
- ret = (*call)();
- trace_initcall_finish((*call), ret);
- call++;
+ while (ce < __con_initcall_end) {
+ call = initcall_from_entry(ce);
+ trace_initcall_start(call);
+ ret = call();
+ trace_initcall_finish(call, ret);
+ ce++;
}
}
@@ -2878,16 +2899,20 @@ void wake_up_klogd(void)
preempt_enable();
}
-int vprintk_deferred(const char *fmt, va_list args)
+void defer_console_output(void)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
-
preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
+}
+
+int vprintk_deferred(const char *fmt, va_list args)
+{
+ int r;
+
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ defer_console_output();
return r;
}
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index d7d091309054..a0a74c533e4b 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -308,24 +308,33 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
void printk_nmi_enter(void)
{
- /*
- * The size of the extra per-CPU buffer is limited. Use it only when
- * the main one is locked. If this CPU is not in the safe context,
- * the lock must be taken on another CPU and we could wait for it.
- */
- if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
- raw_spin_is_locked(&logbuf_lock)) {
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
- } else {
- this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
- }
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
}
void printk_nmi_exit(void)
{
- this_cpu_and(printk_context,
- ~(PRINTK_NMI_CONTEXT_MASK |
- PRINTK_NMI_DEFERRED_CONTEXT_MASK));
+ this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+}
+
+/*
+ * Marks a code that might produce many messages in NMI context
+ * and the risk of losing them is more critical than eventual
+ * reordering.
+ *
+ * It has effect only when called in NMI context. Then printk()
+ * will try to store the messages into the main logbuf directly
+ * and use the per-CPU buffers only as a fallback when the lock
+ * is not available.
+ */
+void printk_nmi_direct_enter(void)
+{
+ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
+ this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
+}
+
+void printk_nmi_direct_exit(void)
+{
+ this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
}
#else
@@ -363,6 +372,20 @@ void __printk_safe_exit(void)
__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+ /*
+ * Try to use the main logbuf even in NMI. But avoid calling console
+ * drivers that might have their own locks.
+ */
+ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
+ raw_spin_trylock(&logbuf_lock)) {
+ int len;
+
+ len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ raw_spin_unlock(&logbuf_lock);
+ defer_console_output();
+ return len;
+ }
+
/* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
@@ -371,13 +394,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
return vprintk_safe(fmt, args);
- /*
- * Use the main logbuf when logbuf_lock is available in NMI.
- * But avoid calling console drivers that might have their own locks.
- */
- if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
- return vprintk_deferred(fmt, args);
-
/* No obstacles. */
return vprintk_default(fmt, args);
}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e4ced883d8de..8fb44dec9ad7 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -294,7 +294,7 @@ void kernel_power_off(void)
}
EXPORT_SYMBOL_GPL(kernel_power_off);
-static DEFINE_MUTEX(reboot_mutex);
+DEFINE_MUTEX(system_transition_mutex);
/*
* Reboot system call: for obvious reasons only root may call it,
@@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
cmd = LINUX_REBOOT_CMD_HALT;
- mutex_lock(&reboot_mutex);
+ mutex_lock(&system_transition_mutex);
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
kernel_restart(NULL);
@@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
ret = -EINVAL;
break;
}
- mutex_unlock(&reboot_mutex);
+ mutex_unlock(&system_transition_mutex);
return ret;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c45de46fdf10..625bc9897f62 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2774,6 +2774,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
+
+ calculate_sigpending();
}
/*
@@ -3159,7 +3161,7 @@ static inline void sched_tick_stop(int cpu) { }
#endif
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
+ defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
* then we just disabled preemption. Start timing the latency.
@@ -5737,6 +5739,18 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * The sched_smt_present static key needs to be evaluated on every
+ * hotplug event because at boot time SMT might be disabled when
+ * the number of booted CPUs is limited.
+ *
+ * If then later a sibling gets hotplugged, then the key would stay
+ * off and SMT scheduling would never be functional.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
+ static_branch_enable_cpuslocked(&sched_smt_present);
+#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -5833,22 +5847,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
- /*
- * We've enumerated all CPUs and will assume that if any CPU
- * has SMT siblings, CPU0 will too.
- */
- if (cpumask_weight(cpu_smt_mask(0)) > 1)
- static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
void __init sched_init_smp(void)
{
sched_init_numa();
@@ -5870,8 +5868,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
- sched_init_smt();
-
sched_smp_initialized = true;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 309c93fcc604..b39fb596f6c1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5839,6 +5839,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
}
#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1a3e9bddd17b..16f84142f2f4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -190,7 +190,7 @@ static void cpuidle_idle_call(void)
*/
next_state = cpuidle_select(drv, dev, &stop_tick);
- if (stop_tick)
+ if (stop_tick || tick_nohz_tick_stopped())
tick_nohz_idle_stop_tick();
else
tick_nohz_idle_retain_tick();
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 870f97b313e3..5dd47f1103d1 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
wait_queue_entry_t *curr, *next;
int cnt = 0;
+ lockdep_assert_held(&wq_head->lock);
+
if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
curr = list_next_entry(bookmark, entry);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8d8a940422a8..5843c541fda9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -65,14 +65,14 @@ static void __user *sig_handler(struct task_struct *t, int sig)
return t->sighand->action[sig - 1].sa.sa_handler;
}
-static int sig_handler_ignored(void __user *handler, int sig)
+static inline bool sig_handler_ignored(void __user *handler, int sig)
{
/* Is it explicitly or implicitly ignored? */
return handler == SIG_IGN ||
- (handler == SIG_DFL && sig_kernel_ignore(sig));
+ (handler == SIG_DFL && sig_kernel_ignore(sig));
}
-static int sig_task_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
@@ -80,12 +80,12 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
- return 1;
+ return true;
return sig_handler_ignored(handler, sig);
}
-static int sig_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
@@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* unblocked.
*/
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
- return 0;
+ return false;
/*
* Tracers may want to know about even ignored signal unless it
@@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* by SIGNAL_UNKILLABLE task.
*/
if (t->ptrace && sig != SIGKILL)
- return 0;
+ return false;
return sig_task_ignored(t, sig, force);
}
@@ -110,7 +110,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
* Re-calculate pending state from the set of locally pending
* signals, globally pending signals, and blocked signals.
*/
-static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
unsigned long ready;
long i;
@@ -138,20 +138,21 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
-static int recalc_sigpending_tsk(struct task_struct *t)
+static bool recalc_sigpending_tsk(struct task_struct *t)
{
if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
- return 1;
+ return true;
}
+
/*
* We must never clear the flag in another thread, or in current
* when it's possible the current syscall is returning -ERESTART*.
* So we don't clear it here, and only callers who know they should do.
*/
- return 0;
+ return false;
}
/*
@@ -172,6 +173,17 @@ void recalc_sigpending(void)
}
+void calculate_sigpending(void)
+{
+ /* Have any signals or users of TIF_SIGPENDING been delayed
+ * until after fork?
+ */
+ spin_lock_irq(&current->sighand->siglock);
+ set_tsk_thread_flag(current, TIF_SIGPENDING);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
/* Given the mask, find the first available signal that should be serviced. */
#define SYNCHRONOUS_MASK \
@@ -362,6 +374,20 @@ static bool task_participate_group_stop(struct task_struct *task)
return false;
}
+void task_join_group_stop(struct task_struct *task)
+{
+ /* Have the new thread join an on-going signal group stop */
+ unsigned long jobctl = current->jobctl;
+ if (jobctl & JOBCTL_STOP_PENDING) {
+ struct signal_struct *sig = current->signal;
+ unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
+ unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ if (task_set_jobctl_pending(task, signr | gstop)) {
+ sig->group_stop_count++;
+ }
+ }
+}
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
@@ -504,13 +530,15 @@ flush_signal_handlers(struct task_struct *t, int force_default)
}
}
-int unhandled_signal(struct task_struct *tsk, int sig)
+bool unhandled_signal(struct task_struct *tsk, int sig)
{
void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
if (is_global_init(tsk))
- return 1;
+ return true;
+
if (handler != SIG_IGN && handler != SIG_DFL)
- return 0;
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
@@ -684,14 +712,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
*
* All callers must be holding the siglock.
*/
-static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
+static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
sigandsets(&m, mask, &s->signal);
if (sigisemptyset(&m))
- return 0;
+ return;
sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
@@ -700,7 +728,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
__sigqueue_free(q);
}
}
- return 1;
}
static inline int is_si_special(const struct siginfo *info)
@@ -717,21 +744,16 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* called with RCU read lock from check_kill_permission()
*/
-static int kill_ok_by_cred(struct task_struct *t)
+static bool kill_ok_by_cred(struct task_struct *t)
{
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
- if (uid_eq(cred->euid, tcred->suid) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->uid, tcred->suid) ||
- uid_eq(cred->uid, tcred->uid))
- return 1;
-
- if (ns_capable(tcred->user_ns, CAP_KILL))
- return 1;
-
- return 0;
+ return uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid) ||
+ ns_capable(tcred->user_ns, CAP_KILL);
}
/*
@@ -882,20 +904,24 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* as soon as they're available, so putting the signal on the shared queue
* will be equivalent to sending it to one such thread.
*/
-static inline int wants_signal(int sig, struct task_struct *p)
+static inline bool wants_signal(int sig, struct task_struct *p)
{
if (sigismember(&p->blocked, sig))
- return 0;
+ return false;
+
if (p->flags & PF_EXITING)
- return 0;
+ return false;
+
if (sig == SIGKILL)
- return 1;
+ return true;
+
if (task_is_stopped_or_traced(p))
- return 0;
+ return false;
+
return task_curr(p) || !signal_pending(p);
}
-static void complete_signal(int sig, struct task_struct *p, int group)
+static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
@@ -908,7 +934,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
*/
if (wants_signal(sig, p))
t = p;
- else if (!group || thread_group_empty(p))
+ else if ((type == PIDTYPE_PID) || thread_group_empty(p))
/*
* There is just one thread and it does not need to be woken.
* It will dequeue unblocked signals before it runs again.
@@ -971,7 +997,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
return;
}
-static inline int legacy_queue(struct sigpending *signals, int sig)
+static inline bool legacy_queue(struct sigpending *signals, int sig)
{
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
@@ -998,7 +1024,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
#endif
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
- int group, int from_ancestor_ns)
+ enum pid_type type, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
@@ -1012,7 +1038,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
from_ancestor_ns || (info == SEND_SIG_FORCED)))
goto ret;
- pending = group ? &t->signal->shared_pending : &t->pending;
+ pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
/*
* Short-circuit ignored signals and support queuing
* exactly one non-rt signal, so that we can get more
@@ -1096,14 +1122,29 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
out_set:
signalfd_notify(t, sig);
sigaddset(&pending->signal, sig);
- complete_signal(sig, t, group);
+
+ /* Let multiprocess signals appear after on-going forks */
+ if (type > PIDTYPE_TGID) {
+ struct multiprocess_signals *delayed;
+ hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
+ sigset_t *signal = &delayed->signal;
+ /* Can't queue both a stop and a continue signal */
+ if (sig == SIGCONT)
+ sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
+ else if (sig_kernel_stop(sig))
+ sigdelset(signal, SIGCONT);
+ sigaddset(signal, sig);
+ }
+ }
+
+ complete_signal(sig, t, type);
ret:
- trace_signal_generate(sig, info, t, group, result);
+ trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
return ret;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
- int group)
+ enum pid_type type)
{
int from_ancestor_ns = 0;
@@ -1112,7 +1153,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
!task_pid_nr_ns(current, task_active_pid_ns(t));
#endif
- return __send_signal(sig, info, t, group, from_ancestor_ns);
+ return __send_signal(sig, info, t, type, from_ancestor_ns);
}
static void print_fatal_signal(int signr)
@@ -1151,23 +1192,23 @@ __setup("print-fatal-signals=", setup_print_fatal_signals);
int
__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- return send_signal(sig, info, p, 1);
+ return send_signal(sig, info, p, PIDTYPE_TGID);
}
static int
specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
- return send_signal(sig, info, t, 0);
+ return send_signal(sig, info, t, PIDTYPE_PID);
}
int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
- bool group)
+ enum pid_type type)
{
unsigned long flags;
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
- ret = send_signal(sig, info, p, group);
+ ret = send_signal(sig, info, p, type);
unlock_task_sighand(p, &flags);
}
@@ -1274,7 +1315,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
/*
* send signal info to all the members of a group
*/
-int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+ enum pid_type type)
{
int ret;
@@ -1283,7 +1325,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
rcu_read_unlock();
if (!ret && sig)
- ret = do_send_sig_info(sig, info, p, true);
+ ret = do_send_sig_info(sig, info, p, type);
return ret;
}
@@ -1301,7 +1343,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
success = 0;
retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- int err = group_send_sig_info(sig, info, p);
+ int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
success |= !err;
retval = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
@@ -1317,7 +1359,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
- error = group_send_sig_info(sig, info, p);
+ error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
@@ -1339,14 +1381,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
return error;
}
-static int kill_as_cred_perm(const struct cred *cred,
- struct task_struct *target)
+static inline bool kill_as_cred_perm(const struct cred *cred,
+ struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
- if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
- !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
- return 0;
- return 1;
+
+ return uid_eq(cred->euid, pcred->suid) ||
+ uid_eq(cred->euid, pcred->uid) ||
+ uid_eq(cred->uid, pcred->suid) ||
+ uid_eq(cred->uid, pcred->uid);
}
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
@@ -1376,7 +1419,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
if (sig) {
if (lock_task_sighand(p, &flags)) {
- ret = __send_signal(sig, info, p, 1, 0);
+ ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
@@ -1420,7 +1463,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
for_each_process(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
- int err = group_send_sig_info(sig, info, p);
+ int err = group_send_sig_info(sig, info, p,
+ PIDTYPE_MAX);
++count;
if (err != -EPERM)
retval = err;
@@ -1446,7 +1490,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
if (!valid_signal(sig))
return -EINVAL;
- return do_send_sig_info(sig, info, p, false);
+ return do_send_sig_info(sig, info, p, PIDTYPE_PID);
}
#define __si_special(priv) \
@@ -1458,8 +1502,7 @@ send_sig(int sig, struct task_struct *p, int priv)
return send_sig_info(sig, __si_special(priv), p);
}
-void
-force_sig(int sig, struct task_struct *p)
+void force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, SEND_SIG_PRIV, p);
}
@@ -1470,8 +1513,7 @@ force_sig(int sig, struct task_struct *p)
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
-int
-force_sigsegv(int sig, struct task_struct *p)
+void force_sigsegv(int sig, struct task_struct *p)
{
if (sig == SIGSEGV) {
unsigned long flags;
@@ -1480,7 +1522,6 @@ force_sigsegv(int sig, struct task_struct *p)
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
force_sig(SIGSEGV, p);
- return 0;
}
int force_sig_fault(int sig, int code, void __user *addr
@@ -1664,17 +1705,20 @@ void sigqueue_free(struct sigqueue *q)
__sigqueue_free(q);
}
-int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
int sig = q->info.si_signo;
struct sigpending *pending;
+ struct task_struct *t;
unsigned long flags;
int ret, result;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
ret = -1;
- if (!likely(lock_task_sighand(t, &flags)))
+ rcu_read_lock();
+ t = pid_task(pid, type);
+ if (!t || !likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
@@ -1696,15 +1740,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
q->info.si_overrun = 0;
signalfd_notify(t, sig);
- pending = group ? &t->signal->shared_pending : &t->pending;
+ pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
list_add_tail(&q->list, &pending->list);
sigaddset(&pending->signal, sig);
- complete_signal(sig, t, group);
+ complete_signal(sig, t, type);
result = TRACE_SIGNAL_DELIVERED;
out:
- trace_signal_generate(sig, &q->info, t, group, result);
+ trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
unlock_task_sighand(t, &flags);
ret:
+ rcu_read_unlock();
return ret;
}
@@ -1877,10 +1922,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
-static inline int may_ptrace_stop(void)
+static inline bool may_ptrace_stop(void)
{
if (!likely(current->ptrace))
- return 0;
+ return false;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
@@ -1896,19 +1941,19 @@ static inline int may_ptrace_stop(void)
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
-static int sigkill_pending(struct task_struct *tsk)
+static bool sigkill_pending(struct task_struct *tsk)
{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
/*
@@ -2288,7 +2333,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal(struct ksignal *ksig)
+bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2298,7 +2343,7 @@ int get_signal(struct ksignal *ksig)
task_work_run();
if (unlikely(uprobe_deny_signal()))
- return 0;
+ return false;
/*
* Do this once, we can't return to user-mode if freezing() == T.
@@ -2755,7 +2800,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
}
#endif
-static int do_sigpending(sigset_t *set)
+static void do_sigpending(sigset_t *set)
{
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
@@ -2764,7 +2809,6 @@ static int do_sigpending(sigset_t *set)
/* Outside the lock because only this thread touches it. */
sigandsets(set, &current->blocked, set);
- return 0;
}
/**
@@ -2776,15 +2820,16 @@ static int do_sigpending(sigset_t *set)
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sigsetsize))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sigsetsize))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -2792,15 +2837,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err)
- err = put_compat_sigset(uset, &set, sigsetsize);
- return err;
+ do_sigpending(&set);
+
+ return put_compat_sigset(uset, &set, sigsetsize);
}
#endif
@@ -3193,7 +3236,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
* probe. No signal is actually delivered.
*/
if (!error && sig) {
- error = do_send_sig_info(sig, info, p, false);
+ error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
@@ -3562,25 +3605,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
sigset_t set;
- int err;
if (sizeof(old_sigset_t) > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
sigset_t set;
- int err = do_sigpending(&set);
- if (!err)
- err = put_user(set.sig[0], set32);
- return err;
+
+ do_sigpending(&set);
+
+ return put_user(set.sig[0], set32);
}
#endif
@@ -3651,25 +3695,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
size_t, sigsetsize)
{
struct k_sigaction new_sa, old_sa;
- int ret = -EINVAL;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (act) {
- if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
- return -EFAULT;
- }
+ if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+ if (ret)
+ return ret;
- if (!ret && oact) {
- if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
- return -EFAULT;
- }
-out:
- return ret;
+ if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
@@ -3960,7 +4002,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
"the deadlock.\n");
return;
}
- ret = send_signal(sig, SEND_SIG_PRIV, t, false);
+ ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
spin_unlock(&t->sighand->siglock);
if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d86eec5f51c1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,6 +584,8 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
+ /* Final decision about SMT support */
+ cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index e27b51d3facd..cf5c67533ff1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1237,18 +1237,19 @@ static int override_release(char __user *release, size_t len)
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
{
- int errno = 0;
+ struct new_utsname tmp;
down_read(&uts_sem);
- if (copy_to_user(name, utsname(), sizeof *name))
- errno = -EFAULT;
+ memcpy(&tmp, utsname(), sizeof(tmp));
up_read(&uts_sem);
+ if (copy_to_user(name, &tmp, sizeof(tmp)))
+ return -EFAULT;
- if (!errno && override_release(name->release, sizeof(name->release)))
- errno = -EFAULT;
- if (!errno && override_architecture(name))
- errno = -EFAULT;
- return errno;
+ if (override_release(name->release, sizeof(name->release)))
+ return -EFAULT;
+ if (override_architecture(name))
+ return -EFAULT;
+ return 0;
}
#ifdef __ARCH_WANT_SYS_OLD_UNAME
@@ -1257,55 +1258,46 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
*/
SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
{
- int error = 0;
+ struct old_utsname tmp;
if (!name)
return -EFAULT;
down_read(&uts_sem);
- if (copy_to_user(name, utsname(), sizeof(*name)))
- error = -EFAULT;
+ memcpy(&tmp, utsname(), sizeof(tmp));
up_read(&uts_sem);
+ if (copy_to_user(name, &tmp, sizeof(tmp)))
+ return -EFAULT;
- if (!error && override_release(name->release, sizeof(name->release)))
- error = -EFAULT;
- if (!error && override_architecture(name))
- error = -EFAULT;
- return error;
+ if (override_release(name->release, sizeof(name->release)))
+ return -EFAULT;
+ if (override_architecture(name))
+ return -EFAULT;
+ return 0;
}
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
- int error;
+ struct oldold_utsname tmp = {};
if (!name)
return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
down_read(&uts_sem);
- error = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->release + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->version + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine, &utsname()->machine,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+ memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
+ memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
+ memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN);
+ memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN);
+ memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN);
up_read(&uts_sem);
+ if (copy_to_user(name, &tmp, sizeof(tmp)))
+ return -EFAULT;
- if (!error && override_architecture(name))
- error = -EFAULT;
- if (!error && override_release(name->release, sizeof(name->release)))
- error = -EFAULT;
- return error ? -EFAULT : 0;
+ if (override_architecture(name))
+ return -EFAULT;
+ if (override_release(name->release, sizeof(name->release)))
+ return -EFAULT;
+ return 0;
}
#endif
@@ -1319,17 +1311,18 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
- down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- struct new_utsname *u = utsname();
+ struct new_utsname *u;
+ down_write(&uts_sem);
+ u = utsname();
memcpy(u->nodename, tmp, len);
memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
uts_proc_notify(UTS_PROC_HOSTNAME);
+ up_write(&uts_sem);
}
- up_write(&uts_sem);
return errno;
}
@@ -1337,8 +1330,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
- int i, errno;
+ int i;
struct new_utsname *u;
+ char tmp[__NEW_UTS_LEN + 1];
if (len < 0)
return -EINVAL;
@@ -1347,11 +1341,11 @@ SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
i = 1 + strlen(u->nodename);
if (i > len)
i = len;
- errno = 0;
- if (copy_to_user(name, u->nodename, i))
- errno = -EFAULT;
+ memcpy(tmp, u->nodename, i);
up_read(&uts_sem);
- return errno;
+ if (copy_to_user(name, tmp, i))
+ return -EFAULT;
+ return 0;
}
#endif
@@ -1370,17 +1364,18 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
- down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- struct new_utsname *u = utsname();
+ struct new_utsname *u;
+ down_write(&uts_sem);
+ u = utsname();
memcpy(u->domainname, tmp, len);
memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
uts_proc_notify(UTS_PROC_DOMAINNAME);
+ up_write(&uts_sem);
}
- up_write(&uts_sem);
return errno;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f22f76b7a138..cc02050fd0c4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -145,7 +145,10 @@ static int minolduid;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
-/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
#ifdef CONFIG_DETECT_HUNG_TASK
static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#endif
@@ -222,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
-/* Note: sysrq code uses it's own private copy */
+/* Note: sysrq code uses its own private copy */
static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
@@ -1091,6 +1094,14 @@ static struct ctl_table kern_table[] = {
.extra2 = &hung_task_timeout_max,
},
{
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
+ },
+ {
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(int),
@@ -1797,6 +1808,24 @@ static struct ctl_table fs_table[] = {
.extra2 = &one,
},
{
+ .procname = "protected_fifos",
+ .data = &sysctl_protected_fifos,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "protected_regular",
+ .data = &sysctl_protected_regular,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "suid_dumpable",
.data = &suid_dumpable,
.maxlen = sizeof(int),
@@ -1965,13 +1994,13 @@ static void warn_sysctl_write(struct ctl_table *table)
}
/**
- * proc_first_pos_non_zero_ignore - check if firs position is allowed
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
* @ppos: file position
* @table: the sysctl table
*
* Returns true if the first position is non-zero and the sysctl_writes_strict
* mode indicates this is not allowed for numeric input types. String proc
- * hadlers can ignore the return value.
+ * handlers can ignore the return value.
*/
static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
struct ctl_table *table)
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index f26acef5d7b4..9a65713c8309 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
{
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
+ struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
- trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
- kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
+ trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
+ kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
return HRTIMER_NORESTART;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 294d7b65af33..ce32cf741b25 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -894,7 +894,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
trace_itimer_expire(signo == SIGPROF ?
ITIMER_PROF : ITIMER_VIRTUAL,
- tsk->signal->leader_pid, cur_time);
+ task_tgid(tsk), cur_time);
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 3ac7295306dc..4b9127e95430 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -86,15 +86,6 @@ static const struct k_clock clock_realtime, clock_monotonic;
#endif
/*
- * parisc wants ENOTSUP instead of EOPNOTSUPP
- */
-#ifndef ENOTSUP
-# define ENANOSLEEP_NOTSUP EOPNOTSUPP
-#else
-# define ENANOSLEEP_NOTSUP ENOTSUP
-#endif
-
-/*
* The timer ID is turned into a timer address by idr_find().
* Verifying a valid ID consists of:
*
@@ -342,8 +333,8 @@ void posixtimer_rearm(struct siginfo *info)
int posix_timer_event(struct k_itimer *timr, int si_private)
{
- struct task_struct *task;
- int shared, ret = -1;
+ enum pid_type type;
+ int ret = -1;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->posixtimer_rearm().
@@ -357,13 +348,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
*/
timr->sigq->info.si_sys_private = si_private;
- rcu_read_lock();
- task = pid_task(timr->it_pid, PIDTYPE_PID);
- if (task) {
- shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
- ret = send_sigqueue(timr->sigq, task, shared);
- }
- rcu_read_unlock();
+ type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
+ ret = send_sigqueue(timr->sigq, timr->it_pid, type);
/* If we failed to send the signal the timer stops. */
return ret > 0;
}
@@ -442,11 +428,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
static struct pid *good_sigevent(sigevent_t * event)
{
- struct task_struct *rtn = current->group_leader;
+ struct pid *pid = task_tgid(current);
+ struct task_struct *rtn;
switch (event->sigev_notify) {
case SIGEV_SIGNAL | SIGEV_THREAD_ID:
- rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+ pid = find_vpid(event->sigev_notify_thread_id);
+ rtn = pid_task(pid, PIDTYPE_PID);
if (!rtn || !same_thread_group(rtn, current))
return NULL;
/* FALLTHRU */
@@ -456,7 +444,7 @@ static struct pid *good_sigevent(sigevent_t * event)
return NULL;
/* FALLTHRU */
case SIGEV_NONE:
- return task_pid(rtn);
+ return pid;
default:
return NULL;
}
@@ -1231,7 +1219,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (!kc)
return -EINVAL;
if (!kc->nsleep)
- return -ENANOSLEEP_NOTSUP;
+ return -EOPNOTSUPP;
if (get_timespec64(&t, rqtp))
return -EFAULT;
@@ -1258,7 +1246,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
if (!kc)
return -EINVAL;
if (!kc->nsleep)
- return -ENANOSLEEP_NOTSUP;
+ return -EOPNOTSUPP;
if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index dcc0166d1997..5e3de28c7677 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -47,6 +47,11 @@ config HAVE_FENTRY
help
Arch supports the gcc options -pg with -mfentry
+config HAVE_NOP_MCOUNT
+ bool
+ help
+ Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -82,6 +87,15 @@ config RING_BUFFER_ALLOW_SWAP
Allow the use of ring_buffer_swap_cpu.
Adds a very slight overhead to tracing when enabled.
+config PREEMPTIRQ_TRACEPOINTS
+ bool
+ depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS
+ select TRACING
+ default y
+ help
+ Create preempt/irq toggle tracepoints if needed, so that other parts
+ of the kernel can use them to generate or add hooks to them.
+
# All tracer options should select GENERIC_TRACER. For those options that are
# enabled by all tracers (context switch and event tracer) they select TRACING.
# This allows those options to appear when no other tracer is selected. But the
@@ -155,18 +169,20 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+config TRACE_PREEMPT_TOGGLE
+ bool
+ help
+ Enables hooks which will be called when preemption is first disabled,
+ and last enabled.
config PREEMPTIRQ_EVENTS
bool "Enable trace events for preempt and irq disable/enable"
select TRACE_IRQFLAGS
- depends on DEBUG_PREEMPT || !PROVE_LOCKING
- depends on TRACING
+ select TRACE_PREEMPT_TOGGLE if PREEMPT
+ select GENERIC_TRACER
default n
help
Enable tracing of disable and enable events for preemption and irqs.
- For tracing preempt disable/enable events, DEBUG_PREEMPT must be
- enabled. For tracing irq disable/enable events, PROVE_LOCKING must
- be disabled.
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
@@ -203,6 +219,7 @@ config PREEMPT_TRACER
select RING_BUFFER_ALLOW_SWAP
select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
+ select TRACE_PREEMPT_TOGGLE
help
This option measures the time spent in preemption-off critical
sections, with microsecond accuracy.
@@ -456,6 +473,26 @@ config KPROBE_EVENTS
This option is also required by perf-probe subcommand of perf tools.
If you want to use perf tools, this option is strongly recommended.
+config KPROBE_EVENTS_ON_NOTRACE
+ bool "Do NOT protect notrace function from kprobe events"
+ depends on KPROBE_EVENTS
+ depends on KPROBES_ON_FTRACE
+ default n
+ help
+ This is only for the developers who want to debug ftrace itself
+ using kprobe events.
+
+ If kprobes can use ftrace instead of breakpoint, ftrace related
+ functions are protected from kprobe-events to prevent an infinit
+ recursion or any unexpected execution path which leads to a kernel
+ crash.
+
+ This option disables such protection and allows you to put kprobe
+ events on ftrace functions for debugging ftrace by itself.
+ Note that this might let you shoot yourself in the foot.
+
+ If unsure, say N.
+
config UPROBE_EVENTS
bool "Enable uprobes-based dynamic events"
depends on ARCH_SUPPORTS_UPROBES
@@ -521,7 +558,7 @@ config FUNCTION_PROFILER
in debugfs called function_profile_enabled which defaults to zero.
When a 1 is echoed into this file profiling begins, and when a
zero is entered, profiling stops. A "functions" file is created in
- the trace_stats directory; this file shows the list of functions that
+ the trace_stat directory; this file shows the list of functions that
have been hit and their counters.
If in doubt, say N.
@@ -605,7 +642,7 @@ config HIST_TRIGGERS
Inter-event tracing of quantities such as latencies is also
supported using hist triggers under this option.
- See Documentation/trace/histogram.txt.
+ See Documentation/trace/histogram.rst.
If in doubt, say N.
config MMIOTRACE_TEST
@@ -687,6 +724,21 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config PREEMPTIRQ_DELAY_TEST
+ tristate "Preempt / IRQ disable delay thread to test latency tracers"
+ depends on m
+ help
+ Select this option to build a test module that can help test latency
+ tracers by executing a preempt or irq disable section with a user
+ configurable delay. The module busy waits for the duration of the
+ critical section.
+
+ For example, the following invocation forces a one-time irq-disabled
+ critical section for 500us:
+ modprobe preemptirq_delay_test test_mode=irq delay=500000
+
+ If unsure, say N
+
config TRACE_EVAL_MAP_FILE
bool "Show eval mappings for trace events"
depends on TRACING
@@ -722,6 +774,18 @@ config TRACING_EVENTS_GPIO
help
Enable tracing events for gpio subsystem
+config GCOV_PROFILE_FTRACE
+ bool "Enable GCOV profiling on ftrace subsystem"
+ depends on GCOV_KERNEL
+ help
+ Enable GCOV profiling on ftrace subsystem for checking
+ which functions/lines are tested.
+
+ If unsure, say N.
+
+ Note that on a kernel compiled with this config, ftrace will
+ run significantly slower.
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e2538c7638d4..f81dadbc7c4a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -13,11 +13,21 @@ obj-y += trace_selftest_dynamic.o
endif
endif
+ifdef CONFIG_FTRACE_STARTUP_TEST
+CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o
+endif
+
# If unlikely tracing is enabled, do not trace these files
ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+# for GCOV coverage profiling
+ifdef CONFIG_GCOV_PROFILE_FTRACE
+GCOV_PROFILE := y
+endif
+
CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
@@ -33,9 +43,10 @@ obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
+obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
-obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9ae283..2868d85f1fb1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -494,6 +482,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
+ if (!blk_debugfs_root)
+ return -ENOENT;
+
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
@@ -518,9 +509,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -ENOENT;
- if (!blk_debugfs_root)
- goto err;
-
dir = debugfs_lookup(buts->name, blk_debugfs_root);
if (!dir)
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
@@ -1841,6 +1829,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
+ if (!!value == !!q->blk_trace) {
+ ret = 0;
+ goto out_unlock_bdev;
+ }
if (value)
ret = blk_trace_setup_queue(q, bdev);
else
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ae6829804bc..08fcfe440c63 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/types.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index caf9cbf35816..f536f601bd46 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Infrastructure for profiling code inserted by 'gcc -pg'.
*
@@ -157,30 +158,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#endif
}
-/**
- * ftrace_nr_registered_ops - return number of ops registered
- *
- * Returns the number of ftrace_ops registered and tracing functions
- */
-int ftrace_nr_registered_ops(void)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- mutex_lock(&ftrace_lock);
-
- for (ops = rcu_dereference_protected(ftrace_ops_list,
- lockdep_is_held(&ftrace_lock));
- ops != &ftrace_list_end;
- ops = rcu_dereference_protected(ops->next,
- lockdep_is_held(&ftrace_lock)))
- cnt++;
-
- mutex_unlock(&ftrace_lock);
-
- return cnt;
-}
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -313,11 +290,6 @@ static void update_ftrace_function(void)
ftrace_trace_function = func;
}
-int using_ftrace_ops_list_func(void)
-{
- return ftrace_trace_function == ftrace_ops_list_func;
-}
-
static void add_ftrace_ops(struct ftrace_ops __rcu **list,
struct ftrace_ops *ops)
{
@@ -1049,8 +1021,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
#endif /* CONFIG_FUNCTION_PROFILER */
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int ftrace_graph_active;
#else
@@ -2927,22 +2897,22 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
{
/* If ops isn't enabled, ignore it */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return 0;
+ return false;
/* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 1;
+ return true;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
!__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
- return 0;
+ return false;
/* If in notrace hash, we ignore it too */
if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
- return 0;
+ return false;
- return 1;
+ return true;
}
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
@@ -2981,12 +2951,14 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
p = &pg->records[i];
p->flags = rec_flags;
+#ifndef CC_USING_NOP_MCOUNT
/*
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
if (!ftrace_code_disable(mod, p))
break;
+#endif
update_cnt++;
}
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
new file mode 100644
index 000000000000..f704390db9fc
--- /dev/null
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Preempt / IRQ disable delay thread to test latency tracers
+ *
+ * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/string.h>
+
+static ulong delay = 100;
+static char test_mode[10] = "irq";
+
+module_param_named(delay, delay, ulong, S_IRUGO);
+module_param_string(test_mode, test_mode, 10, S_IRUGO);
+MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)");
+MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)");
+
+static void busy_wait(ulong time)
+{
+ ktime_t start, end;
+ start = ktime_get();
+ do {
+ end = ktime_get();
+ if (kthread_should_stop())
+ break;
+ } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000));
+}
+
+static int preemptirq_delay_run(void *data)
+{
+ unsigned long flags;
+
+ if (!strcmp(test_mode, "irq")) {
+ local_irq_save(flags);
+ busy_wait(delay);
+ local_irq_restore(flags);
+ } else if (!strcmp(test_mode, "preempt")) {
+ preempt_disable();
+ busy_wait(delay);
+ preempt_enable();
+ }
+
+ return 0;
+}
+
+static int __init preemptirq_delay_init(void)
+{
+ char task_name[50];
+ struct task_struct *test_task;
+
+ snprintf(task_name, sizeof(task_name), "%s_test", test_mode);
+
+ test_task = kthread_run(preemptirq_delay_run, NULL, task_name);
+ return PTR_ERR_OR_ZERO(test_task);
+}
+
+static void __exit preemptirq_delay_exit(void)
+{
+ return;
+}
+
+module_init(preemptirq_delay_init)
+module_exit(preemptirq_delay_exit)
+MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0b0b688ea166..1d92d4a982fd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic ring buffer
*
@@ -3221,7 +3222,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on);
*
* Returns true if the ring buffer is in a state that it accepts writes.
*/
-int ring_buffer_record_is_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_on(struct ring_buffer *buffer)
{
return !atomic_read(&buffer->record_disabled);
}
@@ -3237,7 +3238,7 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer)
* ring_buffer_record_disable(), as that is a temporary disabling of
* the ring buffer.
*/
-int ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
{
return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 68ee79afe31c..ffba6789c0e2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ring buffer tester and benchmark
*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 823687997b01..bf6f1d70484d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ring buffer based function tracer
*
@@ -1087,7 +1088,7 @@ void disable_trace_on_warning(void)
*
* Shows real state of the ring buffer if it is enabled or not.
*/
-int tracer_tracing_is_on(struct trace_array *tr)
+bool tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
return ring_buffer_record_is_on(tr->trace_buffer.buffer);
@@ -7628,7 +7629,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
if (buffer) {
mutex_lock(&trace_types_lock);
- if (val) {
+ if (!!val == tracer_tracing_is_on(tr)) {
+ val = 0; /* do nothing */
+ } else if (val) {
tracer_tracing_on(tr);
if (tr->current_trace->start)
tr->current_trace->start(tr);
@@ -8288,6 +8291,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
tracing_off();
local_irq_save(flags);
+ printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -8367,7 +8371,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- atomic_dec(&dump_running);
+ atomic_dec(&dump_running);
+ printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f8f86231ad90..3b8c0e24ab30 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#ifndef _LINUX_KERNEL_TRACE_H
#define _LINUX_KERNEL_TRACE_H
@@ -594,7 +594,7 @@ void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
-int tracer_tracing_is_on(struct trace_array *tr);
+bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
@@ -937,7 +937,6 @@ void ftrace_destroy_function_files(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
-int using_ftrace_ops_list_func(void);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
void ftrace_init_tracefs_toplevel(struct trace_array *tr,
struct dentry *d_tracer);
@@ -1533,9 +1532,6 @@ extern int event_trigger_init(struct event_trigger_ops *ops,
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable);
extern void update_cond_flag(struct trace_event_file *file);
-extern void unregister_trigger(char *glob, struct event_trigger_ops *ops,
- struct event_trigger_data *test,
- struct trace_event_file *file);
extern int set_trigger_filter(char *filter_str,
struct event_trigger_data *trigger_data,
struct trace_event_file *file);
@@ -1831,6 +1827,21 @@ static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
}
#endif
+#ifdef CONFIG_PREEMPT_TRACER
+void tracer_preempt_on(unsigned long a0, unsigned long a1);
+void tracer_preempt_off(unsigned long a0, unsigned long a1);
+#else
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+void tracer_hardirqs_on(unsigned long a0, unsigned long a1);
+void tracer_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
+#endif
+
extern struct trace_iterator *tracepoint_print_iter;
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index be1d86ff753d..79e6fbe5b365 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#undef TRACE_SYSTEM
#define TRACE_SYSTEM benchmark
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index d8a188e0418a..aaf6793ededa 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* tracing clocks
*
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 1d67464ed95e..06bb2fd9a56c 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/*
* This file defines the trace event structures that go into the ring
* buffer directly. They are created via macros so that changes for them
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c79193e598f5..69a3fe926e8c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace event based perf event profiling/tracing
*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 14ff4ff3caab..f94be0c2827b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* event tracer
*
@@ -239,7 +240,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
struct trace_array_cpu *data;
struct trace_pid_list *pid_list;
- pid_list = rcu_dereference_sched(tr->filtered_pids);
+ pid_list = rcu_dereference_raw(tr->filtered_pids);
if (!pid_list)
return false;
@@ -512,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_pid_list *pid_list;
struct trace_array *tr = data;
- pid_list = rcu_dereference_sched(tr->filtered_pids);
+ pid_list = rcu_dereference_raw(tr->filtered_pids);
trace_filter_add_remove_task(pid_list, NULL, task);
}
@@ -636,7 +637,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
rcu_assign_pointer(tr->filtered_pids, NULL);
/* Wait till all users are no longer using pid filtering */
- synchronize_sched();
+ tracepoint_synchronize_unregister();
trace_free_pid_list(pid_list);
}
@@ -1622,7 +1623,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
}
if (filtered_pids) {
- synchronize_sched();
+ tracepoint_synchronize_unregister();
trace_free_pid_list(filtered_pids);
} else if (pid_list) {
/*
@@ -3036,8 +3037,8 @@ int event_trace_del_tracer(struct trace_array *tr)
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
- /* Access to events are within rcu_read_lock_sched() */
- synchronize_sched();
+ /* Make sure no more events are being executed */
+ tracepoint_synchronize_unregister();
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 893a206bcba4..84a65173b1e9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_events_filter - generic event filtering
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
@@ -899,7 +886,8 @@ int filter_match_preds(struct event_filter *filter, void *rec)
if (!filter)
return 1;
- prog = rcu_dereference_sched(filter->prog);
+ /* Protected by either SRCU(tracepoint_srcu) or preempt_disable */
+ prog = rcu_dereference_raw(filter->prog);
if (!prog)
return 1;
@@ -1626,10 +1614,10 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
/*
* The calls can still be using the old filters.
- * Do a synchronize_sched() to ensure all calls are
+ * Do a synchronize_sched() and to ensure all calls are
* done with them before we free them.
*/
- synchronize_sched();
+ tracepoint_synchronize_unregister();
list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
__free_filter(filter_item->filter);
list_del(&filter_item->list);
@@ -1648,7 +1636,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
kfree(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
- synchronize_sched();
+ tracepoint_synchronize_unregister();
list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
__free_filter(filter_item->filter);
list_del(&filter_item->list);
@@ -1790,7 +1778,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_clear_filter(file);
/* Make sure the filter is not being used */
- synchronize_sched();
+ tracepoint_synchronize_unregister();
__free_filter(filter);
return 0;
@@ -1817,7 +1805,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
if (tmp) {
/* Make sure the call is done with the filter */
- synchronize_sched();
+ tracepoint_synchronize_unregister();
__free_filter(tmp);
}
}
@@ -1847,7 +1835,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
- synchronize_sched();
+ tracepoint_synchronize_unregister();
filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
goto out_unlock;
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
index 39d7ef4f57cb..e651dfbd345e 100644
--- a/kernel/trace/trace_events_filter_test.h
+++ b/kernel/trace/trace_events_filter_test.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#undef TRACE_SYSTEM
#define TRACE_SYSTEM test
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index aae18af94c94..85f6b01431c7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_events_hist - trace event hist triggers
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
*/
@@ -5141,7 +5132,7 @@ static void hist_clear(struct event_trigger_data *data)
if (data->name)
pause_named_trigger(data);
- synchronize_sched();
+ tracepoint_synchronize_unregister();
tracing_map_clear(hist_data->map);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 5dea177cef53..2152d1e530cb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_events_trigger - trace event triggers
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
*/
@@ -34,7 +21,9 @@ void trigger_data_free(struct event_trigger_data *data)
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
- synchronize_sched(); /* make sure current triggers exit before free */
+ /* make sure current triggers exit before free */
+ tracepoint_synchronize_unregister();
+
kfree(data);
}
@@ -579,9 +568,9 @@ out:
* Usually used directly as the @unreg method in event command
* implementations.
*/
-void unregister_trigger(char *glob, struct event_trigger_ops *ops,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct event_trigger_data *data;
bool unregistered = false;
@@ -752,7 +741,7 @@ int set_trigger_filter(char *filter_str,
if (tmp) {
/* Make sure the call is done with the filter */
- synchronize_sched();
+ tracepoint_synchronize_unregister();
free_event_filter(tmp);
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index d7c8e4ec3d9d..1e6db9cbe4dc 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_hwlatdetect.c - A simple Hardware Latency detector.
*
@@ -35,9 +36,6 @@
*
* Includes useful feedback from Clark Williams <clark@redhat.com>
*
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
*/
#include <linux/kthread.h>
#include <linux/tracefs.h>
@@ -354,6 +352,9 @@ static int start_kthread(struct trace_array *tr)
struct task_struct *kthread;
int next_cpu;
+ if (WARN_ON(hwlat_kthread))
+ return 0;
+
/* Just pick the first CPU on first iteration */
current_mask = &save_cpumask;
get_online_cpus();
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 03ecb4465ee4..b7357f9f82a3 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace irqs off critical timings
*
@@ -16,7 +17,6 @@
#include "trace.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
@@ -41,12 +41,12 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph);
#ifdef CONFIG_PREEMPT_TRACER
static inline int
-preempt_trace(void)
+preempt_trace(int pc)
{
- return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+ return ((trace_type & TRACER_PREEMPT_OFF) && pc);
}
#else
-# define preempt_trace() (0)
+# define preempt_trace(pc) (0)
#endif
#ifdef CONFIG_IRQSOFF_TRACER
@@ -367,7 +367,7 @@ out:
}
static inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip)
+start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
@@ -395,7 +395,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
local_save_flags(flags);
- __trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, pc);
per_cpu(tracing_cpu, cpu) = 1;
@@ -403,7 +403,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
}
static inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
@@ -429,7 +429,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
- __trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, pc);
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -438,77 +438,21 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
/* start and stop critical timings used to for stoppage (in idle) */
void start_critical_timings(void)
{
- if (preempt_trace() || irq_trace())
- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+ int pc = preempt_count();
+
+ if (preempt_trace(pc) || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
void stop_critical_timings(void)
{
- if (preempt_trace() || irq_trace())
- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
-}
-EXPORT_SYMBOL_GPL(stop_critical_timings);
-
-#ifdef CONFIG_IRQSOFF_TRACER
-#ifdef CONFIG_PROVE_LOCKING
-void time_hardirqs_on(unsigned long a0, unsigned long a1)
-{
- if (!preempt_trace() && irq_trace())
- stop_critical_timing(a0, a1);
-}
-
-void time_hardirqs_off(unsigned long a0, unsigned long a1)
-{
- if (!preempt_trace() && irq_trace())
- start_critical_timing(a0, a1);
-}
-
-#else /* !CONFIG_PROVE_LOCKING */
-
-/*
- * We are only interested in hardirq on/off events:
- */
-static inline void tracer_hardirqs_on(void)
-{
- if (!preempt_trace() && irq_trace())
- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
-}
+ int pc = preempt_count();
-static inline void tracer_hardirqs_off(void)
-{
- if (!preempt_trace() && irq_trace())
- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
-}
-
-static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (!preempt_trace() && irq_trace())
- stop_critical_timing(CALLER_ADDR0, caller_addr);
-}
-
-static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
-{
- if (!preempt_trace() && irq_trace())
- start_critical_timing(CALLER_ADDR0, caller_addr);
-}
-
-#endif /* CONFIG_PROVE_LOCKING */
-#endif /* CONFIG_IRQSOFF_TRACER */
-
-#ifdef CONFIG_PREEMPT_TRACER
-static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
-{
- if (preempt_trace() && !irq_trace())
- stop_critical_timing(a0, a1);
-}
-
-static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
-{
- if (preempt_trace() && !irq_trace())
- start_critical_timing(a0, a1);
+ if (preempt_trace(pc) || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
-#endif /* CONFIG_PREEMPT_TRACER */
+EXPORT_SYMBOL_GPL(stop_critical_timings);
#ifdef CONFIG_FUNCTION_TRACER
static bool function_enabled;
@@ -634,7 +578,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
return 0;
}
-static void irqsoff_tracer_reset(struct trace_array *tr)
+static void __irqsoff_tracer_reset(struct trace_array *tr)
{
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
@@ -659,12 +603,37 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
}
#ifdef CONFIG_IRQSOFF_TRACER
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+ unsigned int pc = preempt_count();
+
+ if (!preempt_trace(pc) && irq_trace())
+ stop_critical_timing(a0, a1, pc);
+}
+
+void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+ unsigned int pc = preempt_count();
+
+ if (!preempt_trace(pc) && irq_trace())
+ start_critical_timing(a0, a1, pc);
+}
+
static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
return __irqsoff_tracer_init(tr);
}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+ __irqsoff_tracer_reset(tr);
+}
+
static struct tracer irqsoff_tracer __read_mostly =
{
.name = "irqsoff",
@@ -684,12 +653,25 @@ static struct tracer irqsoff_tracer __read_mostly =
.allow_instances = true,
.use_max_tr = true,
};
-# define register_irqsoff(trace) register_tracer(&trace)
-#else
-# define register_irqsoff(trace) do { } while (0)
-#endif
+#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
+void tracer_preempt_on(unsigned long a0, unsigned long a1)
+{
+ int pc = preempt_count();
+
+ if (preempt_trace(pc) && !irq_trace())
+ stop_critical_timing(a0, a1, pc);
+}
+
+void tracer_preempt_off(unsigned long a0, unsigned long a1)
+{
+ int pc = preempt_count();
+
+ if (preempt_trace(pc) && !irq_trace())
+ start_critical_timing(a0, a1, pc);
+}
+
static int preemptoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_PREEMPT_OFF;
@@ -697,11 +679,16 @@ static int preemptoff_tracer_init(struct trace_array *tr)
return __irqsoff_tracer_init(tr);
}
+static void preemptoff_tracer_reset(struct trace_array *tr)
+{
+ __irqsoff_tracer_reset(tr);
+}
+
static struct tracer preemptoff_tracer __read_mostly =
{
.name = "preemptoff",
.init = preemptoff_tracer_init,
- .reset = irqsoff_tracer_reset,
+ .reset = preemptoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = true,
@@ -716,13 +703,9 @@ static struct tracer preemptoff_tracer __read_mostly =
.allow_instances = true,
.use_max_tr = true,
};
-# define register_preemptoff(trace) register_tracer(&trace)
-#else
-# define register_preemptoff(trace) do { } while (0)
-#endif
+#endif /* CONFIG_PREEMPT_TRACER */
-#if defined(CONFIG_IRQSOFF_TRACER) && \
- defined(CONFIG_PREEMPT_TRACER)
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
static int preemptirqsoff_tracer_init(struct trace_array *tr)
{
@@ -731,11 +714,16 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
return __irqsoff_tracer_init(tr);
}
+static void preemptirqsoff_tracer_reset(struct trace_array *tr)
+{
+ __irqsoff_tracer_reset(tr);
+}
+
static struct tracer preemptirqsoff_tracer __read_mostly =
{
.name = "preemptirqsoff",
.init = preemptirqsoff_tracer_init,
- .reset = irqsoff_tracer_reset,
+ .reset = preemptirqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = true,
@@ -750,115 +738,21 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.allow_instances = true,
.use_max_tr = true,
};
-
-# define register_preemptirqsoff(trace) register_tracer(&trace)
-#else
-# define register_preemptirqsoff(trace) do { } while (0)
#endif
__init static int init_irqsoff_tracer(void)
{
- register_irqsoff(irqsoff_tracer);
- register_preemptoff(preemptoff_tracer);
- register_preemptirqsoff(preemptirqsoff_tracer);
-
- return 0;
-}
-core_initcall(init_irqsoff_tracer);
-#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
-
-#ifndef CONFIG_IRQSOFF_TRACER
-static inline void tracer_hardirqs_on(void) { }
-static inline void tracer_hardirqs_off(void) { }
-static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
-static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#ifdef CONFIG_IRQSOFF_TRACER
+ register_tracer(&irqsoff_tracer);
#endif
-
-#ifndef CONFIG_PREEMPT_TRACER
-static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
-static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#ifdef CONFIG_PREEMPT_TRACER
+ register_tracer(&preemptoff_tracer);
#endif
-
-#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
-/* Per-cpu variable to prevent redundant calls when IRQs already off */
-static DEFINE_PER_CPU(int, tracing_irq_cpu);
-
-void trace_hardirqs_on(void)
-{
- if (!this_cpu_read(tracing_irq_cpu))
- return;
-
- trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
- tracer_hardirqs_on();
-
- this_cpu_write(tracing_irq_cpu, 0);
-}
-EXPORT_SYMBOL(trace_hardirqs_on);
-
-void trace_hardirqs_off(void)
-{
- if (this_cpu_read(tracing_irq_cpu))
- return;
-
- this_cpu_write(tracing_irq_cpu, 1);
-
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
- tracer_hardirqs_off();
-}
-EXPORT_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (!this_cpu_read(tracing_irq_cpu))
- return;
-
- trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_on_caller(caller_addr);
-
- this_cpu_write(tracing_irq_cpu, 0);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
- if (this_cpu_read(tracing_irq_cpu))
- return;
-
- this_cpu_write(tracing_irq_cpu, 1);
-
- trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_off_caller(caller_addr);
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-
-/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+ register_tracer(&preemptirqsoff_tracer);
#endif
-#if defined(CONFIG_PREEMPT_TRACER) || \
- (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
-void trace_preempt_on(unsigned long a0, unsigned long a1)
-{
- trace_preempt_enable_rcuidle(a0, a1);
- tracer_preempt_on(a0, a1);
-}
-
-void trace_preempt_off(unsigned long a0, unsigned long a1)
-{
- trace_preempt_disable_rcuidle(a0, a1);
- tracer_preempt_off(a0, a1);
+ return 0;
}
-#endif
+core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e9d99463e5df..c30032367aab 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1,20 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Kprobes-based tracing events
*
* Created by Masami Hiramatsu <mhiramat@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "trace_kprobe: " fmt
@@ -23,6 +12,7 @@
#include <linux/rculist.h>
#include <linux/error-injection.h>
+#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
@@ -87,6 +77,23 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
return nhit;
}
+/* Return 0 if it fails to find the symbol address */
+static nokprobe_inline
+unsigned long trace_kprobe_address(struct trace_kprobe *tk)
+{
+ unsigned long addr;
+
+ if (tk->symbol) {
+ addr = (unsigned long)
+ kallsyms_lookup_name(trace_kprobe_symbol(tk));
+ if (addr)
+ addr += tk->rp.kp.offset;
+ } else {
+ addr = (unsigned long)tk->rp.kp.addr;
+ }
+ return addr;
+}
+
bool trace_kprobe_on_func_entry(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
@@ -99,16 +106,8 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
bool trace_kprobe_error_injectable(struct trace_event_call *call)
{
struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
- unsigned long addr;
- if (tk->symbol) {
- addr = (unsigned long)
- kallsyms_lookup_name(trace_kprobe_symbol(tk));
- addr += tk->rp.kp.offset;
- } else {
- addr = (unsigned long)tk->rp.kp.addr;
- }
- return within_error_injection_list(addr);
+ return within_error_injection_list(trace_kprobe_address(tk));
}
static int register_kprobe_event(struct trace_kprobe *tk);
@@ -393,6 +392,20 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
return NULL;
}
+static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
+{
+ int ret = 0;
+
+ if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
+ if (trace_kprobe_is_return(tk))
+ ret = enable_kretprobe(&tk->rp);
+ else
+ ret = enable_kprobe(&tk->rp.kp);
+ }
+
+ return ret;
+}
+
/*
* Enable trace_probe
* if the file is NULL, enable "perf" handler, or enable "trace" handler.
@@ -400,7 +413,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
- struct event_file_link *link = NULL;
+ struct event_file_link *link;
int ret = 0;
if (file) {
@@ -414,26 +427,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
list_add_tail_rcu(&link->list, &tk->tp.files);
tk->tp.flags |= TP_FLAG_TRACE;
- } else
- tk->tp.flags |= TP_FLAG_PROFILE;
-
- if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
- if (trace_kprobe_is_return(tk))
- ret = enable_kretprobe(&tk->rp);
- else
- ret = enable_kprobe(&tk->rp.kp);
- }
-
- if (ret) {
- if (file) {
- /* Notice the if is true on not WARN() */
- if (!WARN_ON_ONCE(!link))
- list_del_rcu(&link->list);
+ ret = __enable_trace_kprobe(tk);
+ if (ret) {
+ list_del_rcu(&link->list);
kfree(link);
tk->tp.flags &= ~TP_FLAG_TRACE;
- } else {
- tk->tp.flags &= ~TP_FLAG_PROFILE;
}
+
+ } else {
+ tk->tp.flags |= TP_FLAG_PROFILE;
+ ret = __enable_trace_kprobe(tk);
+ if (ret)
+ tk->tp.flags &= ~TP_FLAG_PROFILE;
}
out:
return ret;
@@ -498,6 +503,29 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
return ret;
}
+#if defined(CONFIG_KPROBES_ON_FTRACE) && \
+ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
+static bool within_notrace_func(struct trace_kprobe *tk)
+{
+ unsigned long offset, size, addr;
+
+ addr = trace_kprobe_address(tk);
+ if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset))
+ return false;
+
+ /* Get the entry address of the target function */
+ addr -= offset;
+
+ /*
+ * Since ftrace_location_range() does inclusive range check, we need
+ * to subtract 1 byte from the end address.
+ */
+ return !ftrace_location_range(addr, addr + size - 1);
+}
+#else
+#define within_notrace_func(tk) (false)
+#endif
+
/* Internal register function - just handle k*probes and flags */
static int __register_trace_kprobe(struct trace_kprobe *tk)
{
@@ -506,6 +534,12 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
if (trace_probe_is_registered(&tk->tp))
return -EINVAL;
+ if (within_notrace_func(tk)) {
+ pr_warn("Could not probe notrace function %s\n",
+ trace_kprobe_symbol(tk));
+ return -EINVAL;
+ }
+
for (i = 0; i < tk->tp.nr_args; i++)
traceprobe_update_arg(&tk->tp.args[i]);
@@ -1547,17 +1581,6 @@ fs_initcall(init_kprobe_trace);
#ifdef CONFIG_FTRACE_STARTUP_TEST
-/*
- * The "__used" keeps gcc from removing the function symbol
- * from the kallsyms table. 'noinline' makes sure that there
- * isn't an inlined version used by the test method below
- */
-static __used __init noinline int
-kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
-{
- return a1 + a2 + a3 + a4 + a5 + a6;
-}
-
static __init struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c
new file mode 100644
index 000000000000..16548ee4c8c6
--- /dev/null
+++ b/kernel/trace/trace_kprobe_selftest.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Function used during the kprobe self test. This function is in a separate
+ * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it
+ * can be probed by the selftests.
+ */
+int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6)
+{
+ return a1 + a2 + a3 + a4 + a5 + a6;
+}
diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h
new file mode 100644
index 000000000000..c4fc7268ba7c
--- /dev/null
+++ b/kernel/trace/trace_kprobe_selftest.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Function used during the kprobe self test. This function is in a separate
+ * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it
+ * can be probed by the selftests.
+ */
+int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1c8e30fda46a..6e6cc64faa38 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_output.c
*
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index dbba03ed96de..2f742b74e7e6 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#ifndef __TRACE_EVENTS_H
#define __TRACE_EVENTS_H
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
new file mode 100644
index 000000000000..71f553cceb3c
--- /dev/null
+++ b/kernel/trace/trace_preemptirq.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * preemptoff and irqoff tracepoints
+ *
+ * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org>
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+ if (this_cpu_read(tracing_irq_cpu)) {
+ if (!in_nmi())
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
+ this_cpu_write(tracing_irq_cpu, 0);
+ }
+
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu)) {
+ this_cpu_write(tracing_irq_cpu, 1);
+ tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
+ if (!in_nmi())
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ }
+
+ lockdep_hardirqs_off(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu)) {
+ if (!in_nmi())
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
+ this_cpu_write(tracing_irq_cpu, 0);
+ }
+
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu)) {
+ this_cpu_write(tracing_irq_cpu, 1);
+ tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
+ if (!in_nmi())
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ }
+
+ lockdep_hardirqs_off(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
+
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ if (!in_nmi())
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ if (!in_nmi())
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 50f44b7b2b32..b0875b327f5c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace binary printk
*
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daf54bda4dc8..e99c3ce7aa65 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Common code for probe-based Dynamic events.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* This code was copied from kernel/trace/trace_kprobe.c written by
* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
*
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 75daff22ccea..5f52668e165d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Common header file for probe-based Dynamic events.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* This code was copied from kernel/trace/trace_kprobe.h written by
* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
*
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index e694c9f9efa4..6b1c562ffdaf 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* trace_seq.c
*
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 76d30b4ebe83..8786d17caf49 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#ifndef __TRACE_STAT_H
#define __TRACE_STAT_H
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index bf89a51e740d..e696667da29a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* uprobes-based tracing events
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
@@ -952,7 +940,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
list_del_rcu(&link->list);
/* synchronize with u{,ret}probe_trace_func */
- synchronize_sched();
+ synchronize_rcu();
kfree(link);
if (!list_empty(&tu->tp.files))
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 752d8042bad4..9a1c22310323 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* tracing_map - lock-free map for tracing
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
*
* tracing_map implementation inspired by lock-free map algorithms
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 053eb92b2d31..a6de61fc22de 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
#ifndef __TRACING_MAP_H
#define __TRACING_MAP_H
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6dc6356c3327..bf2c06ef9afc 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -31,6 +31,9 @@
extern struct tracepoint * const __start___tracepoints_ptrs[];
extern struct tracepoint * const __stop___tracepoints_ptrs[];
+DEFINE_SRCU(tracepoint_srcu);
+EXPORT_SYMBOL_GPL(tracepoint_srcu);
+
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -50,6 +53,9 @@ static LIST_HEAD(tracepoint_module_list);
*/
static DEFINE_MUTEX(tracepoints_mutex);
+static struct rcu_head *early_probes;
+static bool ok_to_free_tracepoints;
+
/*
* Note about RCU :
* It is used to delay the free of multiple probes array until a quiescent
@@ -67,16 +73,56 @@ static inline void *allocate_probes(int count)
return p == NULL ? NULL : p->probes;
}
-static void rcu_free_old_probes(struct rcu_head *head)
+static void srcu_free_old_probes(struct rcu_head *head)
{
kfree(container_of(head, struct tp_probes, rcu));
}
+static void rcu_free_old_probes(struct rcu_head *head)
+{
+ call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
+}
+
+static __init int release_early_probes(void)
+{
+ struct rcu_head *tmp;
+
+ ok_to_free_tracepoints = true;
+
+ while (early_probes) {
+ tmp = early_probes;
+ early_probes = tmp->next;
+ call_rcu_sched(tmp, rcu_free_old_probes);
+ }
+
+ return 0;
+}
+
+/* SRCU is initialized at core_initcall */
+postcore_initcall(release_early_probes);
+
static inline void release_probes(struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
+
+ /*
+ * We can't free probes if SRCU is not initialized yet.
+ * Postpone the freeing till after SRCU is initialized.
+ */
+ if (unlikely(!ok_to_free_tracepoints)) {
+ tp_probes->rcu.next = early_probes;
+ early_probes = &tp_probes->rcu;
+ return;
+ }
+
+ /*
+ * Tracepoint probes are protected by both sched RCU and SRCU,
+ * by calling the SRCU callback in the sched RCU callback we
+ * cover both cases. So let us chain the SRCU and sched RCU
+ * callbacks to wait for both grace periods.
+ */
call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
}
}
@@ -325,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ if (!begin)
+ return;
+
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) {
+ const int *iter;
+
+ for (iter = (const int *)begin; iter < (const int *)end; iter++)
+ fct(offset_to_ptr(iter), priv);
+ } else {
+ struct tracepoint * const *iter;
+
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+ }
+}
+
#ifdef CONFIG_MODULES
bool trace_module_has_bad_taint(struct module *mod)
{
@@ -389,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
* Ensure the tracer unregistered the module's probes before the module
* teardown is performed. Prevents leaks of probe and data pointers.
*/
-static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
- struct tracepoint * const *end)
+static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
{
- struct tracepoint * const *iter;
-
- if (!begin)
- return;
- for (iter = begin; iter < end; iter++)
- WARN_ON_ONCE((*iter)->funcs);
+ WARN_ON_ONCE(tp->funcs);
}
static int tracepoint_module_coming(struct module *mod)
@@ -448,8 +509,9 @@ static void tracepoint_module_going(struct module *mod)
* Called the going notifier before checking for
* quiescence.
*/
- tp_module_going_check_quiescent(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ for_each_tracepoint_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints,
+ tp_module_going_check_quiescent, NULL);
break;
}
}
@@ -501,19 +563,6 @@ static __init int init_tracepoints(void)
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
-static void for_each_tracepoint_range(struct tracepoint * const *begin,
- struct tracepoint * const *end,
- void (*fct)(struct tracepoint *tp, void *priv),
- void *priv)
-{
- struct tracepoint * const *iter;
-
- if (!begin)
- return;
- for (iter = begin; iter < end; iter++)
- fct(*iter, priv);
-}
-
/**
* for_each_kernel_tracepoint - iteration on all kernel tracepoints
* @fct: callback
diff --git a/kernel/user.c b/kernel/user.c
index 36288d840675..0df9b1640b2a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
- .__count = ATOMIC_INIT(1),
+ .__count = REFCOUNT_INIT(1),
.processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
@@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
hlist_for_each_entry(user, hashent, uidhash_node) {
if (uid_eq(user->uid, uid)) {
- atomic_inc(&user->__count);
+ refcount_inc(&user->__count);
return user;
}
}
@@ -169,11 +169,8 @@ void free_uid(struct user_struct *up)
if (!up)
return;
- local_irq_save(flags);
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
- else
- local_irq_restore(flags);
}
struct user_struct *alloc_uid(kuid_t uid)
@@ -191,7 +188,7 @@ struct user_struct *alloc_uid(kuid_t uid)
goto out_unlock;
new->uid = uid;
- atomic_set(&new->__count, 1);
+ refcount_set(&new->__count, 1);
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c3d7583fcd21..e5222b5fb4fe 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -859,7 +859,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
unsigned idx;
struct uid_gid_extent extent;
char *kbuf = NULL, *pos, *next_line;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ /* Only allow < page size writes at the beginning of the file */
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ return -EINVAL;
+
+ /* Slurp in the user data */
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/*
* The userns_state_mutex serializes all writes to any given map.
@@ -895,19 +904,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
- /* Only allow < page size writes at the beginning of the file */
- ret = -EINVAL;
- if ((*ppos != 0) || (count >= PAGE_SIZE))
- goto out;
-
- /* Slurp in the user data */
- kbuf = memdup_user_nul(buf, count);
- if (IS_ERR(kbuf)) {
- ret = PTR_ERR(kbuf);
- kbuf = NULL;
- goto out;
- }
-
/* Parse the user data */
ret = -EINVAL;
pos = kbuf;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 233cd8fc6910..258033d62cb3 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,7 +18,7 @@
#ifdef CONFIG_PROC_SYSCTL
-static void *get_uts(struct ctl_table *table, int write)
+static void *get_uts(struct ctl_table *table)
{
char *which = table->data;
struct uts_namespace *uts_ns;
@@ -26,21 +26,9 @@ static void *get_uts(struct ctl_table *table, int write)
uts_ns = current->nsproxy->uts_ns;
which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
- if (!write)
- down_read(&uts_sem);
- else
- down_write(&uts_sem);
return which;
}
-static void put_uts(struct ctl_table *table, int write, void *which)
-{
- if (!write)
- up_read(&uts_sem);
- else
- up_write(&uts_sem);
-}
-
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
@@ -50,13 +38,34 @@ static int proc_do_uts_string(struct ctl_table *table, int write,
{
struct ctl_table uts_table;
int r;
+ char tmp_data[__NEW_UTS_LEN + 1];
+
memcpy(&uts_table, table, sizeof(uts_table));
- uts_table.data = get_uts(table, write);
+ uts_table.data = tmp_data;
+
+ /*
+ * Buffer the value in tmp_data so that proc_dostring() can be called
+ * without holding any locks.
+ * We also need to read the original value in the write==1 case to
+ * support partial writes.
+ */
+ down_read(&uts_sem);
+ memcpy(tmp_data, get_uts(table), sizeof(tmp_data));
+ up_read(&uts_sem);
r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
- put_uts(table, write, uts_table.data);
- if (write)
+ if (write) {
+ /*
+ * Write back the new value.
+ * Note that, since we dropped uts_sem, the result can
+ * theoretically be incorrect if there are two parallel writes
+ * at non-zero offsets to the same sysctl.
+ */
+ down_write(&uts_sem);
+ memcpy(get_uts(table), tmp_data, sizeof(tmp_data));
+ up_write(&uts_sem);
proc_sys_poll_notify(table->poll);
+ }
return r;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 78b192071ef7..60e80198c3df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2652,6 +2652,9 @@ void flush_workqueue(struct workqueue_struct *wq)
if (WARN_ON(!wq_online))
return;
+ lock_map_acquire(&wq->lockdep_map);
+ lock_map_release(&wq->lockdep_map);
+
mutex_lock(&wq->mutex);
/*
@@ -2843,7 +2846,8 @@ reflush:
}
EXPORT_SYMBOL_GPL(drain_workqueue);
-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
+ bool from_cancel)
{
struct worker *worker = NULL;
struct worker_pool *pool;
@@ -2885,7 +2889,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
* workqueues the deadlock happens when the rescuer stalls, blocking
* forward progress.
*/
- if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
+ if (!from_cancel &&
+ (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
}
@@ -2896,6 +2901,27 @@ already_gone:
return false;
}
+static bool __flush_work(struct work_struct *work, bool from_cancel)
+{
+ struct wq_barrier barr;
+
+ if (WARN_ON(!wq_online))
+ return false;
+
+ if (!from_cancel) {
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+ }
+
+ if (start_flush_work(work, &barr, from_cancel)) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ return true;
+ } else {
+ return false;
+ }
+}
+
/**
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
@@ -2909,18 +2935,7 @@ already_gone:
*/
bool flush_work(struct work_struct *work)
{
- struct wq_barrier barr;
-
- if (WARN_ON(!wq_online))
- return false;
-
- if (start_flush_work(work, &barr)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return true;
- } else {
- return false;
- }
+ return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);
@@ -2986,7 +3001,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
* isn't executing.
*/
if (wq_online)
- flush_work(work);
+ __flush_work(work, true);
clear_work_data(work);