From f96f56780ca584930bb3a2769d73fd9a101bcbbe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 4 Aug 2014 03:10:16 +0000 Subject: kprobes: Skip kretprobe hit in NMI context to avoid deadlock Skip kretprobe hit in NMI context, because if an NMI happens inside the critical section protected by kretprobe_table.lock and another(or same) kretprobe hit, pre_kretprobe_handler tries to lock kretprobe_table.lock again. Normal interrupts have no problem because they are disabled with the lock. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: David S. Miller Link: http://lkml.kernel.org/r/20140804031016.11433.65539.stgit@kbuild-fedora.novalocal [ Minor edits for clarity. ] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..3995f546d0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /*TODO: consider to only swap the RA after the last pre_handler fired */ + /* + * To avoid deadlocks, prohibit return probing in NMI contexts, + * just skip the probe and increase the (inexact) 'nmissed' + * statistical counter, so that the user is informed that + * something happened: + */ + if (unlikely(in_nmi())) { + rp->nmissed++; + return 0; + } + + /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { -- cgit v1.2.3 From 71b1fb5c4473a5b1e601d41b109bdfe001ec82e0 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 18 Aug 2014 12:20:20 +0100 Subject: cgroup: reject cgroup names with '\n' /proc//cgroup contains one cgroup path on each line. If cgroup names are allowed to contain "\n", applications cannot parse /proc//cgroup safely. Signed-off-by: Alban Crequy Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..c3d1802a9b30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4543,6 +4543,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc//cgroup unparsable. + */ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; -- cgit v1.2.3 From b3f207855f57b9c8f43a547a801340bb5cbc59e5 Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Fri, 13 Jun 2014 16:03:32 +0100 Subject: perf: Handle compat ioctl When running a 32-bit userspace on a 64-bit kernel (eg. i386 application on x86_64 kernel or 32-bit arm userspace on arm64 kernel) some of the perf ioctls must be treated with special care, as they have a pointer size encoded in the command. For example, PERF_EVENT_IOC_ID in 32-bit world will be encoded as 0x80042407, but 64-bit kernel will expect 0x80082407. In result the ioctl will fail returning -ENOTTY. This patch solves the problem by adding code fixing up the size as compat_ioctl file operation. Reported-by: Drew Richardson Signed-off-by: Pawel Moll Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1402671812-9078-1-git-send-email-pawel.moll@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..f9c1ed002dbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "internal.h" @@ -3717,6 +3718,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + case _IOC_NR(PERF_EVENT_IOC_ID): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { struct perf_event *event; @@ -4222,7 +4243,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; -- cgit v1.2.3 From 33b7f99cf003ca6c1d31c42b50e1100ad71aaec0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:23:02 -0400 Subject: ftrace: Allow ftrace_ops to use the hashes from other ops Currently the top level debug file system function tracer shares its ftrace_ops with the function graph tracer. This was thought to be fine because the tracers are not used together, as one can only enable function or function_graph tracer in the current_tracer file. But that assumption proved to be incorrect. The function profiler can use the function graph tracer when function tracing is enabled. Since all function graph users uses the function tracing ftrace_ops this causes a conflict and when a user enables both function profiling as well as the function tracer it will crash ftrace and disable it. The quick solution so far is to move them as separate ftrace_ops like it was earlier. The problem though is to synchronize the functions that are traced because both function and function_graph tracer are limited by the selections made in the set_ftrace_filter and set_ftrace_notrace files. To handle this, a new structure is made called ftrace_ops_hash. This structure will now hold the filter_hash and notrace_hash, and the ftrace_ops will point to this structure. That will allow two ftrace_ops to share the same hashes. Since most ftrace_ops do not share the hashes, and to keep allocation simple, the ftrace_ops structure will include both a pointer to the ftrace_ops_hash called func_hash, as well as the structure itself, called local_hash. When the ops are registered, the func_hash pointer will be initialized to point to the local_hash within the ftrace_ops structure. Some of the ftrace internal ftrace_ops will be initialized statically. This will allow for the function and function_graph tracer to have separate ops but still share the same hash tables that determine what functions they trace. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 14 +++++-- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++++------------------------ 2 files changed, 63 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6bb5e3f2a3b4..f0b0edbf55a9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -102,6 +102,15 @@ enum { FTRACE_OPS_FL_DELETED = 1 << 8, }; +#ifdef CONFIG_DYNAMIC_FTRACE +/* The hash used to know what functions callbacks trace */ +struct ftrace_ops_hash { + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; + struct mutex regex_lock; +}; +#endif + /* * Note, ftrace_ops can be referenced outside of RCU protection. * (Although, for perf, the control ops prevent that). If ftrace_ops is @@ -121,10 +130,9 @@ struct ftrace_ops { int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE int nr_trampolines; - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_ops_hash local_hash; + struct ftrace_ops_hash *func_hash; struct ftrace_hash *tramp_hash; - struct mutex regex_lock; unsigned long trampoline; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1654b12c891a..c92757adba79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -65,15 +65,17 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE -#define INIT_REGEX_LOCK(opsname) \ - .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else -#define INIT_REGEX_LOCK(opsname) +#define INIT_OPS_HASH(opsname) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -140,7 +142,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { - mutex_init(&ops->regex_lock); + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif @@ -899,7 +902,7 @@ static void unregister_ftrace_profiler(void) static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(ftrace_profile_ops) + INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1081,11 +1084,12 @@ static const struct ftrace_hash empty_hash = { #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) static struct ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, }; struct ftrace_page { @@ -1226,8 +1230,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); - free_ftrace_hash(ops->filter_hash); - free_ftrace_hash(ops->notrace_hash); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); } static struct ftrace_hash *alloc_ftrace_hash(int size_bits) @@ -1382,8 +1386,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * gets inversed. */ if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; /* * If the notrace hash has no items, * then there's nothing to do. @@ -2436,8 +2440,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) * Filter_hash being empty will default to trace module. * But notrace hash requires a test of individual module functions. */ - return ftrace_hash_empty(ops->filter_hash) && - ftrace_hash_empty(ops->notrace_hash); + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); } /* @@ -2459,12 +2463,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 0; /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->filter_hash) && - !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) return 0; return 1; @@ -2785,10 +2789,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } else { rec = &iter->pg->records[iter->idx++]; if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -2837,9 +2841,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) * functions are enabled. */ if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) || + ftrace_hash_empty(ops->func_hash->filter_hash)) || (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->notrace_hash))) { + ftrace_hash_empty(ops->func_hash->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -3001,12 +3005,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; + hash = ops->func_hash->notrace_hash; else - hash = ops->filter_hash; + hash = ops->func_hash->filter_hash; if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; @@ -3041,7 +3045,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, file->private_data = iter; out_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); return ret; } @@ -3279,7 +3283,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(trace_probe_ops) + INIT_OPS_HASH(trace_probe_ops) }; static int ftrace_probe_registered; @@ -3342,7 +3346,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3359,7 +3363,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { @@ -3428,7 +3432,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, out_unlock: mutex_unlock(&ftrace_lock); out: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); return count; @@ -3446,7 +3450,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3468,7 +3472,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3521,7 +3525,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); } @@ -3717,12 +3721,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (enable) - orig_hash = &ops->filter_hash; + orig_hash = &ops->func_hash->filter_hash; else - orig_hash = &ops->notrace_hash; + orig_hash = &ops->func_hash->notrace_hash; if (reset) hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); @@ -3752,7 +3756,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(hash); return ret; @@ -3975,15 +3979,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); - mutex_lock(&iter->ops->regex_lock); + mutex_lock(&iter->ops->func_hash->regex_lock); if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); if (filter_hash) - orig_hash = &iter->ops->filter_hash; + orig_hash = &iter->ops->func_hash->filter_hash; else - orig_hash = &iter->ops->notrace_hash; + orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, @@ -3994,7 +3998,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } - mutex_unlock(&iter->ops->regex_lock); + mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); @@ -4611,7 +4615,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + INIT_OPS_HASH(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4713,7 +4717,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(control_ops) + INIT_OPS_HASH(control_ops) }; static inline void -- cgit v1.2.3 From fa8137be6ba632041e725e4623258ba27a2cf9be Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 11:44:03 -0400 Subject: cgroup: Display legacy cgroup files on default hierarchy Kernel command line parameter cgroup__DEVEL__legacy_files_on_dfl forces legacy cgroup files to show up on default hierarhcy if susbsystem does not have any files defined for default hierarchy. But this seems to be working only if legacy files are defined in ss->legacy_cftypes. If one adds some cftypes later using cgroup_add_legacy_cftypes(), these files don't show up on default hierarchy. Update the function accordingly so that the dynamically added legacy files also show up in the default hierarchy if the target subsystem is also using the base legacy files for the default hierarchy. tj: Patch description and comment updates. Signed-off-by: Vivek Goyal Signed-off-by: Tejun Heo --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c3d1802a9b30..50b94113f4f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3271,8 +3271,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; + /* + * If legacy_flies_on_dfl, we want to show the legacy files on the + * dfl hierarchy but iff the target subsystem hasn't been updated + * for the dfl hierarchy yet. + */ + if (!cgroup_legacy_files_on_dfl || + ss->dfl_cftypes != ss->legacy_cftypes) { + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + } + return cgroup_add_cftypes(ss, cfts); } -- cgit v1.2.3 From 84261912ebee41269004e8a9f3614ba38ef6b206 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 18 Aug 2014 13:21:08 -0400 Subject: ftrace: Update all ftrace_ops for a ftrace_hash_ops update When updating what an ftrace_ops traces, if it is registered (that is, actively tracing), and that ftrace_ops uses the shared global_ops local_hash, then we need to update all tracers that are active and also share the global_ops' ftrace_hash_ops. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c92757adba79..37f9e90d241c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1292,9 +1292,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) } static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_move(struct ftrace_ops *ops, int enable, @@ -1346,13 +1346,13 @@ update: * Remove the current set, update the hash and add * them back. */ - ftrace_hash_rec_disable(ops, enable); + ftrace_hash_rec_disable_modify(ops, enable); old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ftrace_hash_rec_enable(ops, enable); + ftrace_hash_rec_enable_modify(ops, enable); return 0; } @@ -1686,6 +1686,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; -- cgit v1.2.3 From bce0b6c51ac76fc0e763262a6c2a9d05e486f0d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 20 Aug 2014 23:57:04 -0400 Subject: ftrace: Fix up trampoline accounting with looping on hash ops Now that a ftrace_hash can be shared by multiple ftrace_ops, they can dec the rec->flags by more than once (one per those that share the ftrace_hash). This means that the tramp_hash may not have a hash item when it was added. For example, if two ftrace_ops share a hash for a ftrace record, and the first ops has a trampoline, when it adds itself it will set the rec->flags TRAMP flag and increments its nr_trampolines counter. When the second ops is added, it must clear that tramp flag but also decrement the other ops that shares its hash. As the update to the function callbacks has not yet been performed, the other ops will not have the tramp hash set yet and it can not be used to know to decrement its nr_trampolines. Luckily, the tramp_hash does not need to be used. As the ftrace_mutex is held, a ops with a trampoline to a record during an update of another ops that shares the record will have its func_hash pointing to it. Since a trampoline can only be set for a record if only one ops is attached to it, we can just check if the record has a trampoline (the FTRACE_FL_TRAMP flag is set) and then find the ops that has this record in its hashes. Also added some output to help debug when things go wrong. Cc: stable@vger.kernel.org # 3.16+ (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37f9e90d241c..92376aeac4a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1507,25 +1507,38 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static void ftrace_remove_tramp(struct ftrace_ops *ops, struct dyn_ftrace *rec) { - struct ftrace_func_entry *entry; - - entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); - if (!entry) + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) return; + rec->flags &= ~FTRACE_FL_TRAMP; + + if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || + ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + return; /* * The tramp_hash entry will be removed at time * of update. */ ops->nr_trampolines--; - rec->flags &= ~FTRACE_FL_TRAMP; } -static void ftrace_clear_tramps(struct dyn_ftrace *rec) +static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) { struct ftrace_ops *op; + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This function is called to clear other tramps + * not the one that is being updated. + */ + if (op == ops) + continue; if (op->nr_trampolines) ftrace_remove_tramp(op, rec); } while_for_each_ftrace_op(op); @@ -1626,13 +1639,10 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, /* * If we are adding another function callback * to this function, and the previous had a - * trampoline used, then we need to go back to - * the default trampoline. + * custom trampoline in use, then we need to go + * back to the default trampoline. */ - rec->flags &= ~FTRACE_FL_TRAMP; - - /* remove trampolines from any ops for this rec */ - ftrace_clear_tramps(rec); + ftrace_clear_tramps(rec, ops); } /* @@ -1935,8 +1945,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP) { ops = ftrace_find_tramp_ops_new(rec); if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2266,7 +2276,10 @@ static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) } while_for_each_ftrace_rec(); /* The number of recs in the hash must match nr_trampolines */ - FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) + pr_warn("count=%ld trampolines=%d\n", + ops->tramp_hash->count, + ops->nr_trampolines); return 0; } -- cgit v1.2.3 From 5f151b240192a1557119d5375af71efc26825bc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:18:46 -0400 Subject: ftrace: Fix function_profiler and function tracer together The latest rewrite of ftrace removed the separate ftrace_ops of the function tracer and the function graph tracer and had them share the same ftrace_ops. This simplified the accounting by removing the multiple layers of functions called, where the global_ops func would call a special list that would iterate over the other ops that were registered within it (like function and function graph), which itself was registered to the ftrace ops list of all functions currently active. If that sounds confusing, the code that implemented it was also confusing and its removal is a good thing. The problem with this change was that it assumed that the function and function graph tracer can never be used at the same time. This is mostly true, but there is an exception. That is when the function profiler uses the function graph tracer to profile. The function profiler can be activated the same time as the function tracer, and this breaks the assumption and the result is that ftrace will crash (it detects the error and shuts itself down, it does not cause a kernel oops). To solve this issue, a previous change allowed the hash tables for the functions traced by a ftrace_ops to be a pointer and let multiple ftrace_ops share the same hash. This allows the function and function_graph tracer to have separate ftrace_ops, but still share the hash, which is what is done. Now the function and function graph tracers have separate ftrace_ops again, and the function tracer can be run while the function_profile is active. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 60 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92376aeac4a7..08aca65d709a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,8 +68,12 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -4663,7 +4667,6 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -5197,6 +5200,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + static int ftrace_graph_active; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -5359,12 +5373,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) */ static void update_function_graph_func(void) { - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list == &global_ops && - global_ops.next == &ftrace_list_end)) - ftrace_graph_entry = __ftrace_graph_entry; - else + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; } static struct notifier_block ftrace_suspend_notifier = { @@ -5405,16 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - /* Function graph doesn't use the .func field of global_ops */ - global_ops.flags |= FTRACE_OPS_FL_STUB; - -#ifdef CONFIG_DYNAMIC_FTRACE - /* Optimize function graph calling (if implemented by arch) */ - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; -#endif - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5432,12 +5453,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - global_ops.flags &= ~FTRACE_OPS_FL_STUB; -#ifdef CONFIG_DYNAMIC_FTRACE - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = 0; -#endif + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3 From 39b5552cd5090d4c210d278cd2732f493075f033 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sun, 17 Aug 2014 20:59:10 -0400 Subject: ftrace: Use current addr when converting to nop in __ftrace_replace_code() In __ftrace_replace_code(), when converting the call to a nop in a function it needs to compare against the "curr" (current) value of the ftrace ops, and not the "new" one. It currently does not affect x86 which is the only arch to do the trampolines with function graph tracer, but when other archs that do depend on this code implement the function graph trampoline, it can crash. Here's an example when ARM uses the trampolines (in the future): ------------[ cut here ]------------ WARNING: CPU: 0 PID: 9 at kernel/trace/ftrace.c:1716 ftrace_bug+0x17c/0x1f4() Modules linked in: omap_rng rng_core ipv6 CPU: 0 PID: 9 Comm: migration/0 Not tainted 3.16.0-test-10959-gf0094b28f303-dirty #52 [] (unwind_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x78/0x94) [] (dump_stack) from [] (warn_slowpath_common+0x7c/0x9c) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x2c/0x34) [] (warn_slowpath_null) from [] (ftrace_bug+0x17c/0x1f4) [] (ftrace_bug) from [] (ftrace_replace_code+0x80/0x9c) [] (ftrace_replace_code) from [] (ftrace_modify_all_code+0xb8/0x164) [] (ftrace_modify_all_code) from [] (__ftrace_modify_code+0x14/0x1c) [] (__ftrace_modify_code) from [] (multi_cpu_stop+0xf4/0x134) [] (multi_cpu_stop) from [] (cpu_stopper_thread+0x54/0x130) [] (cpu_stopper_thread) from [] (smpboot_thread_fn+0x1ac/0x1bc) [] (smpboot_thread_fn) from [] (kthread+0xe0/0xfc) [] (kthread) from [] (ret_from_fork+0x14/0x20) ---[ end trace dc9ce72c5b617d8f ]--- [ 65.047264] ftrace failed to modify [] asm_do_IRQ+0x10/0x1c [ 65.054070] actual: 85:1b:00:eb Fixes: 7413af1fb70e7 "ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global" Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08aca65d709a..5916a8e59e87 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2017,7 +2017,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, ftrace_addr); + return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); -- cgit v1.2.3 From 7cad45eea3849faeb34591b60d16b50d13a38d77 Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Fri, 22 Aug 2014 01:31:20 +0200 Subject: irq: Export handle_fasteoi_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export handle_fasteoi_irq to be able to use it in e.g. the Zynq gpio driver since commit 6dd859508336 ("gpio: zynq: Fix IRQ handlers"). This fixes the following link issue: ERROR: "handle_fasteoi_irq" [drivers/gpio/gpio-zynq.ko] undefined! Signed-off-by: Vincent Stehlé Acked-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Cc: Vincent Stehle Cc: Lars-Peter Clausen Cc: Linus Walleij Link: http://lkml.kernel.org/r/1408663880-29179-1-git-send-email-vincent.stehle@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..6223fab9a9d2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -517,6 +517,7 @@ out: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler -- cgit v1.2.3 From 4ce97dbf50245227add17c83d87dc838e7ca79d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Aug 2014 13:59:41 -0400 Subject: trace: Fix epoll hang when we race with new entries Epoll on trace_pipe can sometimes hang in a weird case. If the ring buffer is empty when we set waiters_pending but an event shows up exactly at that moment we can miss being woken up by the ring buffers irq work. Since ring_buffer_empty() is inherently racey we will sometimes think that the buffer is not empty. So we don't get woken up and we don't think there are any events even though there were some ready when we added the watch, which makes us hang. This patch fixes this by making sure that we are actually on the wait list before we set waiters_pending, and add a memory barrier to make sure ring_buffer_empty() is going to be correct. Link: http://lkml.kernel.org/p/1408989581-23727-1-git-send-email-jbacik@fb.com Cc: stable@vger.kernel.org # 3.10+ Cc: Martin Lau Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index afb04b9b818a..b38fb2b9e237 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) -- cgit v1.2.3 From 11ed7f934cb807f26da09547b5946c2e534d1dac Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 27 Aug 2014 16:43:40 -0400 Subject: rcu: Make nocb leader kthreads process pending callbacks after spawning The nocb callbacks generated before the nocb kthreads are spawned are enqueued in the nocb queue for later processing. Commit fbce7497ee5af ("rcu: Parallelize and economize NOCB kthread wakeups") introduced nocb leader kthreads which checked the nocb_leader_wake flag to see if there were any such pending callbacks. A case was reported in which newly spawned leader kthreads were not processing the pending callbacks as this flag was not set, which led to a boot hang. The following commit ensures that the newly spawned nocb kthreads process the pending callbacks by allowing the kthreads to run immediately after spawning instead of waiting. This is done by inverting the logic of nocb_leader_wake tests to nocb_leader_sleep which allows us to use the default initialization of this flag to 0 to let the kthreads run. Reported-by: Amit Shah Signed-off-by: Pranith Kumar Link: http://www.spinics.net/lists/kernel/msg1802899.html [ paulmck: Backported to v3.17-rc2. ] Signed-off-by: Paul E. McKenney Tested-by: Amit Shah --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71e64c718f75..6a86eb7bac45 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -358,7 +358,7 @@ struct rcu_data { struct rcu_head **nocb_gp_tail; long nocb_gp_count; long nocb_gp_count_lazy; - bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 00dc411e9676..a7997e272564 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; - if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior xchg orders against prior callback enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } } @@ -2253,7 +2253,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); wait_event_interruptible(my_rdp->nocb_wq, - ACCESS_ONCE(my_rdp->nocb_leader_wake)); + !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2292,12 +2292,12 @@ wait_again: schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) if (ACCESS_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_wake = true; + my_rdp->nocb_leader_sleep = false; break; } goto wait_again; @@ -2307,17 +2307,17 @@ wait_again: rcu_nocb_wait_gp(my_rdp); /* - * We left ->nocb_leader_wake set to reduce cache thrashing. - * We clear it now, but recheck for new callbacks while + * We left ->nocb_leader_sleep unset to reduce cache thrashing. + * We set it now, but recheck for new callbacks while * traversing our follower list. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { if (ACCESS_ONCE(rdp->nocb_head)) - my_rdp->nocb_leader_wake = true; /* No need to wait. */ + my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ -- cgit v1.2.3 From 800df627e2eabaf4a921d342a1d5162c843b7fc2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:29 -0700 Subject: resource: fix the case of null pointer access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard and Daniel reported that UML is broken due to changes to resource traversal functions. Problem is that iomem_resource.child can be null and new code does not consider that possibility. Old code used a for loop and that loop will not even execute if p was null. Revert back to for() loop logic and bail out if p is null. I also moved sibling_only check out of resource_lock. There is no reason to keep it inside the lock. Following is backtrace of the UML crash. RIP: 0033:[<0000000060039b9f>] RSP: 0000000081459da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000219b3fff RCX: 000000006010d1d9 RDX: 0000000000000001 RSI: 00000000602dfb94 RDI: 0000000081459df8 RBP: 0000000081459de0 R08: 00000000601b59f4 R09: ffffffff0000ff00 R10: ffffffff0000ff00 R11: 0000000081459e88 R12: 0000000081459df8 R13: 00000000219b3fff R14: 00000000602dfb94 R15: 0000000000000000 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 1 Comm: swapper Not tainted 3.16.0-10454-g58d08e3 #13 Stack: 00000000 000080d0 81459df0 219b3fff 81459e70 6010d1d9 ffffffff 6033e010 81459e50 6003a269 81459e30 00000000 Call Trace: [<6010d1d9>] ? kclist_add_private+0x0/0xe7 [<6003a269>] walk_system_ram_range+0x61/0xb7 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6010d574>] kcore_update_ram+0x4c/0x168 [<6010d72e>] ? kclist_add+0x0/0x2e [<6000e943>] proc_kcore_init+0xea/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<600189f0>] do_one_initcall+0x13c/0x204 [<6004ca46>] ? parse_args+0x1df/0x2e0 [<6004c82d>] ? parameq+0x0/0x3a [<601b5990>] ? strcpy+0x0/0x18 [<60001e1a>] kernel_init_freeable+0x240/0x31e [<6026f1c0>] kernel_init+0x12/0x148 [<60019fad>] new_thread_handler+0x81/0xa3 Fixes 8c86e70acead629aacb4a ("resource: provide new functions to walk through resources"). Reported-by: Daniel Walter Tested-by: Richard Weinberger Tested-by: Toralf Förster Tested-by: Daniel Walter Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..60c5a3856ab7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name, end = res->end; BUG_ON(start >= end); - read_lock(&resource_lock); - - if (first_level_children_only) { - p = iomem_resource.child; + if (first_level_children_only) sibling_only = true; - } else - p = &iomem_resource; - while ((p = next_resource(p, sibling_only))) { + read_lock(&resource_lock); + + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) -- cgit v1.2.3 From 74ca317c26a3f8543203b61d262c0ab2e30c384e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:46 -0700 Subject: kexec: create a new config option CONFIG_KEXEC_FILE for new syscall Currently new system call kexec_file_load() and all the associated code compiles if CONFIG_KEXEC=y. But new syscall also compiles purgatory code which currently uses gcc option -mcmodel=large. This option seems to be available only gcc 4.4 onwards. Hiding new functionality behind a new config option will not break existing users of old gcc. Those who wish to enable new functionality will require new gcc. Having said that, I am trying to figure out how can I move away from using -mcmodel=large but that can take a while. I think there are other advantages of introducing this new config option. As this option will be enabled only on x86_64, other arches don't have to compile generic kexec code which will never be used. This new code selects CRYPTO=y and CRYPTO_SHA256=y. And all other arches had to do this for CONFIG_KEXEC. Now with introduction of new config option, we can remove crypto dependency from other arches. Now CONFIG_KEXEC_FILE is available only on x86_64. So whereever I had CONFIG_X86_64 defined, I got rid of that. For CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal Cc: Eric Biederman Cc: H. Peter Anvin Tested-by: Shaun Ruffell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kbuild | 4 +--- arch/x86/Kconfig | 18 ++++++++++++++---- arch/x86/Makefile | 5 +---- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/crash.c | 6 ++---- arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ arch/x86/purgatory/Makefile | 5 +---- kernel/kexec.c | 11 +++++++++++ 8 files changed, 42 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 61b6d51866f8..3942f74c92d7 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -17,6 +17,4 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ -ifeq ($(CONFIG_X86_64),y) -obj-$(CONFIG_KEXEC) += purgatory/ -endif +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d0bf1aa9dcb..778178f4c7d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1585,9 +1585,6 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" - select BUILD_BIN2C - select CRYPTO - select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1602,9 +1599,22 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_FILE + bool "kexec file based system call" + select BUILD_BIN2C + depends on KEXEC + depends on X86_64 + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + ---help--- + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by previous system call. + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC + depends on KEXEC_FILE ---help--- This option makes kernel signature verification mandatory for kexec_file_load() syscall. If kernel is signature can not be diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c1aa36887843..c96bcec544fc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -184,11 +184,8 @@ archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all archprepare: -ifeq ($(CONFIG_KEXEC),y) -# Build only for 64bit. No loaders for 32bit yet. - ifeq ($(CONFIG_X86_64),y) +ifeq ($(CONFIG_KEXEC_FILE),y) $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c - endif endif ### diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b5ea75c4a4b4..ada2e2d6be3e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o @@ -118,5 +119,4 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0553a34fa0df..a618fcd2c07d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -182,8 +182,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_X86_64 - +#ifdef CONFIG_KEXEC_FILE static int get_nr_ram_ranges_callback(unsigned long start_pfn, unsigned long nr_pfn, void *arg) { @@ -696,5 +695,4 @@ int crash_load_segments(struct kimage *image) return ret; } - -#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8b04018e5d1f..485981059a40 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -25,9 +25,11 @@ #include #include +#ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { &kexec_bzImage64_ops, }; +#endif static void free_transition_pgtable(struct kimage *image) { @@ -178,6 +180,7 @@ static void load_segments(void) ); } +#ifdef CONFIG_KEXEC_FILE /* Update purgatory as needed after various image segments have been prepared */ static int arch_update_purgatory(struct kimage *image) { @@ -209,6 +212,12 @@ static int arch_update_purgatory(struct kimage *image) return ret; } +#else /* !CONFIG_KEXEC_FILE */ +static inline int arch_update_purgatory(struct kimage *image) +{ + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ int machine_kexec_prepare(struct kimage *image) { @@ -329,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ +#ifdef CONFIG_KEXEC_FILE int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { @@ -522,3 +532,4 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 7fde9ee438a4..c4ae06e4ae74 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,7 +24,4 @@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) -# No loaders for 32bits yet. -ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_KEXEC) += kexec-purgatory.o -endif +obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b49a0a58102..2bee072268d9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -64,7 +64,9 @@ bool kexec_in_progress = false; char __weak kexec_purgatory[0]; size_t __weak kexec_purgatory_size = 0; +#ifdef CONFIG_KEXEC_FILE static int kexec_calculate_store_digests(struct kimage *image); +#endif /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { @@ -341,6 +343,7 @@ out_free_image: return ret; } +#ifdef CONFIG_KEXEC_FILE static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { struct fd f = fdget(fd); @@ -612,6 +615,9 @@ out_free_image: kfree(image); return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) @@ -1451,6 +1458,8 @@ out: return ret; } +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE static int __kexec_add_segment(struct kimage *image, char *buf, unsigned long bufsz, unsigned long mem, unsigned long memsz) @@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_KEXEC_FILE */ /* * Move into place and start executing a preloaded standalone -- cgit v1.2.3 From 62109b43176b87e78b2b6d91bcfe16128c30229b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Sep 2014 01:21:03 +0200 Subject: PM / sleep: Fix test_suspend= command line option After commit d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) the pm_states[] array is not populated initially, which causes setup_test_suspend() to always fail and the suspend testing during boot doesn't work any more. Fix the problem by using pm_labels[] instead of pm_states[] in setup_test_suspend() and storing a pointer to the label of the sleep state to test rather than the number representing it, because the connection between the state numbers and labels is only established by suspend_set_ops(). Fixes: d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) Reported-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 1 + kernel/power/suspend.c | 2 +- kernel/power/suspend_test.c | 31 +++++++++++++++++++------------ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d49dcac2537..2df883a9d3cb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ +extern const char *pm_labels[]; extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6dadb25cb0d8..18c62195660f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,7 +31,7 @@ #include "power.h" -static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 2f524928b6aa..bd91bc177c93 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - suspend_state_t i; + int i; /* "=mem" ==> "mem" */ value++; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i], value)) { - test_state = i; + for (i = 0; pm_labels[i]; i++) + if (!strcmp(pm_labels[i], value)) { + test_state_label = pm_labels[i]; return 0; } @@ -158,13 +158,21 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!pm_states[test_state]) { - printk(warn_bad_state, pm_states[test_state]); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? */ @@ -173,13 +181,12 @@ static int __init test_suspend(void) rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); -- cgit v1.2.3 From a4189487da1b4f8260c6006b9dc47c3c4107a5ae Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:07 +0800 Subject: cgroup: delay the clearing of cgrp->kn->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run these two scripts concurrently: for ((; ;)) { mkdir /cgroup/sub rmdir /cgroup/sub } for ((; ;)) { echo $$ > /cgroup/sub/cgroup.procs echo $$ > /cgroup/cgroup.procs } A kernel bug will be triggered: BUG: unable to handle kernel NULL pointer dereference at 00000038 IP: [] cgroup_put+0x9/0x80 ... Call Trace: [] cgroup_kn_unlock+0x39/0x50 [] cgroup_kn_lock_live+0x61/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 We clear cgrp->kn->priv in the end of cgroup_rmdir(), but another concurrent thread can access kn->priv after the clearing. We should move the clearing to css_release_work_fn(). At that time no one is holding reference to the cgroup and no one can gain a new reference to access it. v2: - move RCU_INIT_POINTER() into the else block. (Tejun) - remove the cgroup_parent() check. (Tejun) - update the comment in css_tryget_online_from_dir(). Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50b94113f4f7..bf30076664ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4396,6 +4396,15 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. + */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } mutex_unlock(&cgroup_mutex); @@ -4834,16 +4843,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgroup_kn_unlock(kn); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. - */ - if (!ret) - RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - cgroup_put(cgrp); return ret; } @@ -5430,7 +5429,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See cgroup_rmdir() for details. + * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit v1.2.3 From aa32362f011c6e863132b16c1761487166a4bad2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:38 +0800 Subject: cgroup: check cgroup liveliness before unbreaking kernfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup_kn_lock_live() is called through some kernfs operation and another thread is calling cgroup_rmdir(), we'll trigger the warning in cgroup_get(). ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0() ... Call Trace: [] dump_stack+0x41/0x52 [] warn_slowpath_common+0x7f/0xa0 [] warn_slowpath_null+0x1d/0x20 [] cgroup_get+0x89/0xa0 [] cgroup_kn_lock_live+0x28/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 ---[ end trace 6f2e0c38c2108a74 ]--- Fix this by calling css_tryget() instead of cgroup_get(). v2: - move cgroup_tryget() right below cgroup_get() definition. (Tejun) Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf30076664ca..940aced4ed00 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp) css_get(&cgrp->self); } +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ - cgroup_get(cgrp); + if (!cgroup_tryget(cgrp)) + return NULL; kernfs_break_active_protection(kn); mutex_lock(&cgroup_mutex); -- cgit v1.2.3 From 40bea039593dfc7f3f9814dab844f6db43ae580b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Aug 2014 18:50:16 +0200 Subject: nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_null+0x1a/0x20 [] irq_work_queue_on+0x11e/0x140 [] tick_nohz_full_kick_cpu+0x57/0x90 [] __perf_event_overflow+0x275/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] ? arch_vtime_task_switch+0x63/0x130 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? lock_release+0xab/0x330 [] default_do_nmi+0x72/0x1c0 [] ? cpuacct_account_field+0xcf/0x200 [] do_nmi+0xb8/0x100 Lets fix this by restoring the use of local irq work for the nohz local kick. Reported-by: Catalin Iacob Reported-and-tested-by: Dave Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 7 +------ kernel/time/tick-sched.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 059052306831..9a82c7dc3fdd 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -183,13 +183,8 @@ static inline bool tick_nohz_full_cpu(int cpu) extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); +extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); - -static inline void tick_nohz_full_kick(void) -{ - tick_nohz_full_kick_cpu(smp_processor_id()); -} - extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..f654a8a298fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -224,6 +224,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, }; +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. -- cgit v1.2.3 From 849151dd5481bc8acb1d287a299b5d6a4ca9f1c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:18:07 +0200 Subject: compat: nanosleep: Clarify error handling The error handling in compat_sys_nanosleep() is correct, but completely non obvious. Document it and restrict it to the -ERESTART_RESTARTBLOCK return value for clarity. Reported-by: Kees Cook Signed-off-by: Thomas Gleixner --- kernel/compat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 633394f442f8..ebb3c369d03d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); - if (ret) { + if (ret == -ERESTART_RESTARTBLOCK) { rmtp = restart->nanosleep.compat_rmtp; if (rmtp && compat_put_timespec(&rmt, rmtp)) @@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); - if (ret) { + /* + * hrtimer_nanosleep() can only return 0 or + * -ERESTART_RESTARTBLOCK here because: + * + * - we call it with HRTIMER_MODE_REL and therefor exclude the + * -ERESTARTNOHAND return path. + * + * - we supply the rmtp argument from the task stack (due to + * the necessary compat conversion. So the update cannot + * fail, which excludes the -EFAULT return path as well. If + * it fails nevertheless we have a bigger problem and wont + * reach this place anymore. + * + * - if the return value is 0, we do not have to update rmtp + * because there is no remaining time. + * + * We check for -ERESTART_RESTARTBLOCK nevertheless if the + * core implementation decides to return random nonsense. + */ + if (ret == -ERESTART_RESTARTBLOCK) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } - return ret; } -- cgit v1.2.3 From 9bf2419fa7bffa16ce58a4d5c20399eff8c970c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:24:49 +0200 Subject: timekeeping: Update timekeeper before updating vsyscall and pvclock The update_walltime() code works on the shadow timekeeper to make the seqcount protected region as short as possible. But that update to the shadow timekeeper does not update all timekeeper fields because it's sufficient to do that once before it becomes life. One of these fields is tkr.base_mono. That stays stale in the shadow timekeeper unless an operation happens which copies the real timekeeper to the shadow. The update function is called after the update calls to vsyscall and pvclock. While not correct, it did not cause any problems because none of the invoked update functions used base_mono. commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based) changed that in the kvm pvclock update function, so the stale mono_base value got used and caused kvm-clock to malfunction. Put the update where it belongs and fix the issue. Reported-by: Chris J Arges Reported-by: Paolo Bonzini Cc: Gleb Natapov Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409050000570.3333@nanos Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb4a9c2cf8d9..ec1791fae965 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -442,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk_update_ktime_data(tk); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); -- cgit v1.2.3