From 9a0f05cb36888550d1509d60aa55788615abea44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 15:13:29 +0100 Subject: perf: Update the mmap control page on mmap() Apparently we didn't update the mmap control page right after mmap(), which leads to surprises when userspace wants to use it. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dcpi7164djsexmx6ya7lilrc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f8f3f103cb4..0ca1f648ac08 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3534,6 +3534,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) event->mmap_user = get_current_user(); vma->vm_mm->pinned_vm += event->mmap_locked; + perf_event_update_userpage(event); + unlock: if (!ret) atomic_inc(&event->mmap_count); -- cgit v1.2.3 From 35edc2a5095efb189e60dc32bbb9d2663aec6d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:36:02 +0100 Subject: perf, arch: Rework perf_event_index() Put the logic to compute the event index into a per pmu method. This is required because the x86 rules are weird and wonderful and don't match the capabilities of the current scheme. AFAIK only powerpc actually has a usable userspace read of the PMCs but I'm not at all sure anybody actually used that. ARM is restored to the default since it currently does not support userspace access at all. And all software events are provided with a method that reports their index as 0 (disabled). Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Richard Kuo Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dfydxodki16lylkt3gl2j7cw@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/perf_event.h | 4 ---- arch/frv/include/asm/perf_event.h | 2 -- arch/hexagon/include/asm/perf_event.h | 2 -- arch/powerpc/include/asm/perf_event_server.h | 2 -- arch/powerpc/kernel/perf_event.c | 6 ++++++ arch/s390/include/asm/perf_event.h | 1 - arch/x86/include/asm/perf_event.h | 2 -- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 27 ++++++++++++++++++++++----- kernel/events/hw_breakpoint.c | 7 +++++++ 10 files changed, 41 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h index 0f8e3827a89b..08f94d8fc04c 100644 --- a/arch/arm/include/asm/perf_event.h +++ b/arch/arm/include/asm/perf_event.h @@ -12,10 +12,6 @@ #ifndef __ARM_PERF_EVENT_H__ #define __ARM_PERF_EVENT_H__ -/* ARM performance counters start from 1 (in the cp15 accesses) so use the - * same indexes here for consistency. */ -#define PERF_EVENT_INDEX_OFFSET 1 - /* ARM perf PMU IDs for use by internal perf clients. */ enum arm_perf_pmu_ids { ARM_PERF_PMU_ID_XSCALE1 = 0, diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h index a69e0155d146..c52ea5546b5b 100644 --- a/arch/frv/include/asm/perf_event.h +++ b/arch/frv/include/asm/perf_event.h @@ -12,6 +12,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/hexagon/include/asm/perf_event.h b/arch/hexagon/include/asm/perf_event.h index 6c2910f91180..8b8526b491c7 100644 --- a/arch/hexagon/include/asm/perf_event.h +++ b/arch/hexagon/include/asm/perf_event.h @@ -19,6 +19,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 8f1df1208d23..1a8093fa8f71 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -61,8 +61,6 @@ struct pt_regs; extern unsigned long perf_misc_flags(struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -#define PERF_EVENT_INDEX_OFFSET 1 - /* * Only override the default definitions in include/linux/perf_event.h * if we have hardware PMU support. diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 10a140f82cb8..d614ab57ccca 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1187,6 +1187,11 @@ static int power_pmu_event_init(struct perf_event *event) return err; } +static int power_pmu_event_idx(struct perf_event *event) +{ + return event->hw.idx; +} + struct pmu power_pmu = { .pmu_enable = power_pmu_enable, .pmu_disable = power_pmu_disable, @@ -1199,6 +1204,7 @@ struct pmu power_pmu = { .start_txn = power_pmu_start_txn, .cancel_txn = power_pmu_cancel_txn, .commit_txn = power_pmu_commit_txn, + .event_idx = power_pmu_event_idx, }; /* diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index a75f168d2718..4eb444edbe49 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -6,4 +6,3 @@ /* Empty, just to avoid compiling error */ -#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..9b922c136254 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,8 +188,6 @@ extern u32 get_ibs_caps(void); #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); -#define PERF_EVENT_INDEX_OFFSET 0 - /* * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. * This flag is otherwise unused and ABI specified to be 0, so nobody should diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb3..02545e6df95b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,6 +680,12 @@ struct pmu { * for each successful ->add() during the transaction. */ void (*cancel_txn) (struct pmu *pmu); /* optional */ + + /* + * Will return the value for perf_event_mmap_page::index for this event, + * if no implementation is provided it will default to: event->hw.idx + 1. + */ + int (*event_idx) (struct perf_event *event); /*optional */ }; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 0ca1f648ac08..3894309c41a2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3208,10 +3208,6 @@ int perf_event_task_disable(void) return 0; } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) @@ -3220,7 +3216,7 @@ static int perf_event_index(struct perf_event *event) if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; + return event->pmu->event_idx(event); } static void calc_timer_values(struct perf_event *event, @@ -4992,6 +4988,11 @@ static int perf_swevent_init(struct perf_event *event) return 0; } +static int perf_swevent_event_idx(struct perf_event *event) +{ + return 0; +} + static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5001,6 +5002,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -5087,6 +5090,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -5306,6 +5311,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; /* @@ -5378,6 +5385,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -5405,6 +5414,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +static int perf_event_idx_default(struct perf_event *event) +{ + return event->hw.idx + 1; +} + /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. @@ -5594,6 +5608,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->event_idx) + pmu->event_idx = perf_event_idx_default; + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..b0309f76d777 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -613,6 +613,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ + return 0; +} + static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -622,6 +627,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From 365a4038486b57bb2bd516706a80f82f250f5306 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 20:58:59 +0100 Subject: perf: Fix mmap_page::offset computation There's multiple reason the counter might be unavailable, change the condition to !->index since perf_event_index() should return 0 for all those cases. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-1ixr3olci40w8rgv2evv2ldh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3894309c41a2..05affc3878ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3268,7 +3268,7 @@ void perf_event_update_userpage(struct perf_event *event) barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) + if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 8 ++++++ include/linux/perf_event.h | 1 + kernel/events/core.c | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 53b569910175..116b040a73a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - set_in_cr4(X86_CR4_PCE); + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void) } } + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event) return idx + 1; } +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + NULL, +}; + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, + .attr_groups = x86_pmu_attr_groups, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..513d617b93c4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -307,6 +307,14 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; + /* + * sysfs attrs + */ + int attr_rdpmc; + + /* + * CPU Hotplug hooks + */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 05affc3878ff..dcd4049e92fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++ include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 ++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 116b040a73a8..f8bddb5b0600 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "perf_event.h" @@ -1627,6 +1628,19 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return; + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + return; + + userpg->time_mult = this_cpu_read(cyc2ns); + userpg->time_shift = CYC2NS_SCALE_FACTOR; + userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} + /* * callchain support */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dcd4049e92fc..3a9c7d81afbf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3 From 6c303d3ab39f0dc69546f179c424ee1124f50906 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Nov 2011 21:13:48 +0100 Subject: tracing: let trace_signal_generate() report more info, kill overflow_fail/lose_info __send_signal()->trace_signal_generate() doesn't report enough info. The users want to know was the signal actually delivered or not, and they also need the shared/private info. The patch moves trace_signal_generate() at the end of __send_signal() and adds the 2 additional arguments. This also allows us to kill trace_signal_overflow_fail/lose_info, we can simply add the appropriate TRACE_SIGNAL_ "result" codes. Reported-by: Seiji Aguchi Reviewed-by: Seiji Aguchi Acked-by: Steven Rostedt Signed-off-by: Oleg Nesterov --- include/trace/events/signal.h | 85 +++++++++++-------------------------------- kernel/signal.c | 22 +++++++---- 2 files changed, 36 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h index 17df43464df0..39a8a430d90f 100644 --- a/include/trace/events/signal.h +++ b/include/trace/events/signal.h @@ -23,11 +23,23 @@ } \ } while (0) +#ifndef TRACE_HEADER_MULTI_READ +enum { + TRACE_SIGNAL_DELIVERED, + TRACE_SIGNAL_IGNORED, + TRACE_SIGNAL_ALREADY_PENDING, + TRACE_SIGNAL_OVERFLOW_FAIL, + TRACE_SIGNAL_LOSE_INFO, +}; +#endif + /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct + * @group: shared or private + * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, @@ -37,9 +49,10 @@ */ TRACE_EVENT(signal_generate, - TP_PROTO(int sig, struct siginfo *info, struct task_struct *task), + TP_PROTO(int sig, struct siginfo *info, struct task_struct *task, + int group, int result), - TP_ARGS(sig, info, task), + TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) @@ -47,6 +60,8 @@ TRACE_EVENT(signal_generate, __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) + __field( int, group ) + __field( int, result ) ), TP_fast_assign( @@ -54,11 +69,14 @@ TRACE_EVENT(signal_generate, TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; + __entry->group = group; + __entry->result = result; ), - TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d", + TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, - __entry->comm, __entry->pid) + __entry->comm, __entry->pid, __entry->group, + __entry->result) ); /** @@ -101,65 +119,6 @@ TRACE_EVENT(signal_deliver, __entry->sa_handler, __entry->sa_flags) ); -DECLARE_EVENT_CLASS(signal_queue_overflow, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info), - - TP_STRUCT__entry( - __field( int, sig ) - __field( int, group ) - __field( int, errno ) - __field( int, code ) - ), - - TP_fast_assign( - __entry->sig = sig; - __entry->group = group; - TP_STORE_SIGINFO(__entry, info); - ), - - TP_printk("sig=%d group=%d errno=%d code=%d", - __entry->sig, __entry->group, __entry->errno, __entry->code) -); - -/** - * signal_overflow_fail - called when signal queue is overflow - * @sig: signal number - * @group: signal to process group or not (bool) - * @info: pointer to struct siginfo - * - * Kernel fails to generate 'sig' signal with 'info' siginfo, because - * siginfo queue is overflow, and the signal is dropped. - * 'group' is not 0 if the signal will be sent to a process group. - * 'sig' is always one of RT signals. - */ -DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info) -); - -/** - * signal_lose_info - called when siginfo is lost - * @sig: signal number - * @group: signal to process group or not (bool) - * @info: pointer to struct siginfo - * - * Kernel generates 'sig' signal but loses 'info' siginfo, because siginfo - * queue is overflow. - * 'group' is not 0 if the signal will be sent to a process group. - * 'sig' is always one of non-RT signals. - */ -DEFINE_EVENT(signal_queue_overflow, signal_lose_info, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info) -); - #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ diff --git a/kernel/signal.c b/kernel/signal.c index c73c4284160e..1bd9e86fda1f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,13 +1054,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; int override_rlimit; - - trace_signal_generate(sig, info, t); + int ret = 0, result; assert_spin_locked(&t->sighand->siglock); + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, from_ancestor_ns)) - return 0; + goto ret; pending = group ? &t->signal->shared_pending : &t->pending; /* @@ -1068,8 +1068,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ + result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) - return 0; + goto ret; + + result = TRACE_SIGNAL_DELIVERED; /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. @@ -1127,14 +1130,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * signal was rt and sent by user using something * other than kill(). */ - trace_signal_overflow_fail(sig, group, info); - return -EAGAIN; + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ - trace_signal_lose_info(sig, group, info); + result = TRACE_SIGNAL_LOSE_INFO; } } @@ -1142,7 +1146,9 @@ out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); - return 0; +ret: + trace_signal_generate(sig, info, t, group, result); + return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, -- cgit v1.2.3 From 163566f60bfe6a8176650155e2d98649b0dfabf8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Nov 2011 21:37:41 +0100 Subject: tracing: send_sigqueue() needs trace_signal_generate() too Add trace_signal_generate() into send_sigqueue(). send_sigqueue() is very similar to __send_signal(), just it uses the preallocated info. It should do the same wrt tracing. Reported-by: Seiji Aguchi Reviewed-by: Seiji Aguchi Acked-by: Steven Rostedt Signed-off-by: Oleg Nesterov --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1bd9e86fda1f..8511e39813c7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1591,7 +1591,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) int sig = q->info.si_signo; struct sigpending *pending; unsigned long flags; - int ret; + int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1600,6 +1600,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) goto ret; ret = 1; /* the signal is ignored */ + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, 0)) goto out; @@ -1611,6 +1612,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) */ BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; + result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } q->info.si_overrun = 0; @@ -1620,7 +1622,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); + result = TRACE_SIGNAL_DELIVERED; out: + trace_signal_generate(sig, &q->info, t, group, result); unlock_task_sighand(t, &flags); ret: return ret; -- cgit v1.2.3 From ac483c446b67870444c9eeaf8325d3d2af9b91bc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Jan 2012 10:04:14 +0100 Subject: ftrace: Change filter/notrace set functions to return exit code Currently the ftrace_set_filter and ftrace_set_notrace functions do not return any return code. So there's no way for ftrace_ops user to tell wether the filter was correctly applied. The set_ftrace_filter interface returns error in case the filter did not match: # echo krava > set_ftrace_filter bash: echo: write error: Invalid argument Changing both ftrace_set_filter and ftrace_set_notrace functions to return zero if the filter was applied correctly or -E* values in case of error. Link: http://lkml.kernel.org/r/1325495060-6402-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 028e26f0bf08..f33fb3b041c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -178,9 +178,9 @@ struct dyn_ftrace { }; int ftrace_force_update(void); -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559a0eef..e2e0597c0845 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3146,8 +3146,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3157,6 +3159,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); + out_regex_unlock: mutex_unlock(&ftrace_regex_lock); free_ftrace_hash(hash); @@ -3173,10 +3176,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 1); + return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3191,10 +3194,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 0); + return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** -- cgit v1.2.3 From f069686e4bdc60a637d210ea3eea25fcdb82df88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Jan 2012 20:18:55 -0500 Subject: tracing/softirq: Move __raise_softirq_irqoff() out of header The __raise_softirq_irqoff() contains a tracepoint. As tracepoints in headers can cause issues, and not to mention, bloats the kernel when they are in a static inline, it is best to move the function that contains the tracepoint out of the header and into softirq.c. Link: http://lkml.kernel.org/r/20120118120711.GB14863@elte.hu Suggested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 7 +------ kernel/irq/chip.c | 2 ++ kernel/softirq.c | 6 ++++++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a64b00e286f5..3f830e005118 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -20,7 +20,6 @@ #include #include #include -#include /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -456,11 +455,7 @@ asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -static inline void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} +extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..fc418249f01f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -16,6 +16,8 @@ #include #include +#include + #include "internals.h" /** diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..06d40993594a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -385,6 +385,12 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; -- cgit v1.2.3 From 86f5e6a7b192721995ece919985ac75222402351 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vázquez Cao Date: Thu, 9 Feb 2012 17:42:22 -0500 Subject: watchdog: Fix code/comments mismatches Reflect the change in the soft and hard lockup thresholds and their relation to the frequency of the hrtimer and NMI events in the code comments. While at it, remove references to files that do not exist anymore. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1328827342-6253-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d117262deba3..14bc092fb12c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -3,12 +3,9 @@ * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ @@ -117,9 +114,10 @@ static unsigned long get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns - * the divide by 5 is to give hrtimer 5 chances to - * increment before the hardlockup detector generates - * a warning + * the divide by 5 is to give hrtimer several chances (two + * or three with the current relation between the soft + * and hard thresholds) to increment before the + * hardlockup detector generates a warning */ return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } @@ -336,9 +334,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in watchdog_timer_fn(). + * Run briefly (kicked by the hrtimer callback function) once every + * get_sample_period() seconds (4 seconds by default) to reset the + * softlockup timestamp. If this gets delayed for more than + * 2*watchdog_thresh seconds then the debug-printout triggers in + * watchdog_timer_fn(). */ while (!kthread_should_stop()) { __touch_watchdog(); -- cgit v1.2.3 From 1e42e83fde5537266c1d1e7fd8c010b3028d50fc Mon Sep 17 00:00:00 2001 From: Geunsik Lim Date: Wed, 8 Feb 2012 19:05:36 +0900 Subject: ftrace: sched_switch plugin is deprecated Actually, sched_switch function tracer is merged into wakeup/wakeup_rt Update 'mini-HOWTO' for ftrace(Kernel function tracer). If we want to trace "sched:sched_switch" to trace sched_switch func, We may utilize event option.(e.g: trace-cmd list -e | grep sched) This patch is based on Linux-3.3.rc2-SMP-PREEMPT Link: http://lkml.kernel.org/r/1328695537-15081-1-git-send-email-geunsik.lim@gmail.com Cc: Randy Dunlap Signed-off-by: Geunsik Lim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a00..10d5503f0d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2764,12 +2764,12 @@ static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "# mount -t debugfs nodev /sys/kernel/debug\n\n" "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" + "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" + "wakeup\n" "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -- cgit v1.2.3 From 95100358491abaa2e9a5483811370059bbca4645 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Nov 2011 20:03:08 +0100 Subject: printk/tracing: Add console output tracing Add a printk.console trace point to record any printk messages into the trace, regardless of the current console loglevel. This can help correlate (existing) printk debugging with other tracing. Link: http://lkml.kernel.org/r/1322161388.5366.54.camel@jlt3.sipsolutions.net Acked-by: Frederic Weisbecker Cc: Christoph Hellwig Cc: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Signed-off-by: Johannes Berg Signed-off-by: Steven Rostedt --- include/trace/events/printk.h | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/printk.c | 5 +++++ 2 files changed, 46 insertions(+) create mode 100644 include/trace/events/printk.h (limited to 'kernel') diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h new file mode 100644 index 000000000000..94ec79cc011a --- /dev/null +++ b/include/trace/events/printk.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM printk + +#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PRINTK_H + +#include + +TRACE_EVENT_CONDITION(console, + TP_PROTO(const char *log_buf, unsigned start, unsigned end, + unsigned log_buf_len), + + TP_ARGS(log_buf, start, end, log_buf_len), + + TP_CONDITION(start != end), + + TP_STRUCT__entry( + __dynamic_array(char, msg, end - start + 1) + ), + + TP_fast_assign( + if ((start & (log_buf_len - 1)) > (end & (log_buf_len - 1))) { + memcpy(__get_dynamic_array(msg), + log_buf + (start & (log_buf_len - 1)), + log_buf_len - (start & (log_buf_len - 1))); + memcpy((char *)__get_dynamic_array(msg) + + log_buf_len - (start & (log_buf_len - 1)), + log_buf, end & (log_buf_len - 1)); + } else + memcpy(__get_dynamic_array(msg), + log_buf + (start & (log_buf_len - 1)), + end - start); + ((char *)__get_dynamic_array(msg))[end - start] = 0; + ), + + TP_printk("%s", __get_str(msg)) +); +#endif /* _TRACE_PRINTK_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..cb8a6bd697c6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -44,6 +44,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Architectures can override it: */ @@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" static void _call_console_drivers(unsigned start, unsigned end, int msg_log_level) { + trace_console(&LOG_BUF(0), start, end, log_buf_len); + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { -- cgit v1.2.3 From 47b0edcb599ea6eb9ef16d3a08932a0e01485293 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Tue, 29 Nov 2011 22:08:00 +0100 Subject: tracing/trivial: Use kcalloc instead of kzalloc to allocate array The advantage of kcalloc is, that will prevent integer overflows which could result from the multiplication of number of elements and size and it is also a bit nicer to read. The semantic patch that makes this change is available in https://lkml.org/lkml/2011/11/25/107 Link: http://lkml.kernel.org/r/1322600880.1534.347.camel@localhost.localdomain Signed-off-by: Thomas Meyer Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_events_filter.c | 7 +++---- kernel/trace/trace_syscalls.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e2e0597c0845..d1499e910fe8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1129,7 +1129,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return NULL; size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); if (!hash->buckets) { kfree(hash); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee7127451..76afaee99dbc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -685,7 +685,7 @@ find_event_field(struct ftrace_event_call *call, char *name) static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); if (!stack->preds) return -ENOMEM; stack->index = n_preds; @@ -826,8 +826,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) __free_preds(filter); - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); if (!filter->preds) return -ENOMEM; @@ -1486,7 +1485,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) children = count_leafs(preds, &preds[root->left]); children += count_leafs(preds, &preds[root->right]); - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); if (!root->ops) return -ENOMEM; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a1..43500153dd1e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void) unsigned long addr; int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; -- cgit v1.2.3 From e404b321dbb2d6e438522b7dce9c1d0c6a8c5275 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Sun, 19 Feb 2012 14:16:07 +0300 Subject: tracing: Don't print an extra separator of flags If __print_flags() is used after another __print_*() function, the temp seq_file buffer will not be empty on entry, and the delimiter will be printed even though there's just one field. We get something like: |S instead of just: S This is because the length of the temp seq buffer is used to determine if the delimiter is printed or not. But this algorithm fails when the seq buffer is not empty on entry, and the delimiter will be printed because it thinks that a previous field was already printed. Link: http://lkml.kernel.org/r/1329650167-480655-1-git-send-email-avagin@openvz.org Signed-off-by: Andrew Vagin Cc: Ingo Molnar Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..3efd718984fb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,8 +310,10 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } -- cgit v1.2.3 From 5b34926114e39e12005031269613d2b13194aeba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Feb 2012 20:37:32 -0500 Subject: tracing: Don't use p->len field to determine output in __print_*() functions If more than one __print_*() function is used in a tracepoint (__print_flags(), __print_symbols(), etc), then the temp seq buffer will not be zero on entry. Using the temp seq buffer's length to know if data has been printed or not in the current function is incorrect and may produce incorrect results. Currently, no in-tree tracepoint causes this bug, but new ones may be created. Cc: Andrew Vagin Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3efd718984fb..c5a01873567d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -319,7 +319,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -346,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -372,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); -- cgit v1.2.3 From e248491ac283b516958ca9ab62c8e74b6718bca8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:48 +0100 Subject: ftrace: Add enable/disable ftrace_ops control interface Adding a way to temporarily enable/disable ftrace_ops. The change follows the same way as 'global' ftrace_ops are done. Introducing 2 global ftrace_ops - control_ops and ftrace_control_list which take over all ftrace_ops registered with FTRACE_OPS_FL_CONTROL flag. In addition new per cpu flag called 'disabled' is also added to ftrace_ops to provide the control information for each cpu. When ftrace_ops with FTRACE_OPS_FL_CONTROL is registered, it is set as disabled for all cpus. The ftrace_control_list contains all the registered 'control' ftrace_ops. The control_ops provides function which iterates ftrace_control_list and does the check for 'disabled' flag on current cpu. Adding 3 inline functions: ftrace_function_local_disable/ftrace_function_local_enable - enable/disable the ftrace_ops on current cpu ftrace_function_local_disabled - get disabled ftrace_ops::disabled value for current cpu Link: http://lkml.kernel.org/r/1329317514-8131-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 66 +++++++++++++++++++++++++++++ kernel/trace/ftrace.c | 111 +++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 2 + 3 files changed, 172 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f33fb3b041c6..64a309d2fbd6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,16 +31,33 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +/* + * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are + * set in the flags member. + * + * ENABLED - set/unset when ftrace_ops is registered/unregistered + * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops + * is part of the global tracers sharing the same filter + * via set_ftrace_* debugfs files. + * DYNAMIC - set when ftrace_ops is registered to denote dynamically + * allocated ftrace_ops which need special care + * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops + * could be controled by following calls: + * ftrace_function_local_enable + * ftrace_function_local_disable + */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, FTRACE_OPS_FL_DYNAMIC = 1 << 2, + FTRACE_OPS_FL_CONTROL = 1 << 3, }; struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; unsigned long flags; + int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; @@ -97,6 +114,55 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); +/** + * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_enable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))--; +} + +/** + * ftrace_function_local_disable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_disable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))++; +} + +/** + * ftrace_function_local_disabled - returns ftrace_ops disabled value + * on current cpu + * + * This function returns value of ftrace_ops::disabled on current cpu. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) +{ + WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); + return *this_cpu_ptr(ops->disabled); +} + extern void ftrace_stub(unsigned long a0, unsigned long a1); #else /* !CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1499e910fe8..f615f974d90e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,6 +62,8 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { }; static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void update_global_ops(void) { ftrace_func_t func; @@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ftrace_disabled) @@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + /* We don't support both control and global flags set. */ + if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) + return -EINVAL; + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); + add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -EINVAL; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + ret = remove_ftrace_list_ops(&ftrace_global_list, + &global_ops, ops); if (!ret) ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + if (!ret) { + /* + * The ftrace_ops is now removed from the list, + * so there'll be no new users. We must ensure + * all current users are done before we free + * the control data. + */ + synchronize_sched(); + control_ops_free(ops); + } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -3873,6 +3940,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + op = rcu_dereference_raw(ftrace_control_list); + while (op != &ftrace_list_end) { + if (!ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + + op = rcu_dereference_raw(op->next); + }; + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, +}; + static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..55c6ea00f28a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -288,6 +288,8 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) #define TRACE_GLOBAL_BIT (1<<12) +#define TRACE_CONTROL_BIT (1<<13) + /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function -- cgit v1.2.3 From ceec0b6fc7cd43b38a40c2d40223f9cd0616f0cd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:49 +0100 Subject: ftrace, perf: Add open/close tracepoint perf registration actions Adding TRACE_REG_PERF_OPEN and TRACE_REG_PERF_CLOSE to differentiate register/unregister from open/close actions. The register/unregister actions are invoked for the first/last tracepoint user when opening/closing the event. The open/close actions are invoked for each tracepoint user when opening/closing the event. Link: http://lkml.kernel.org/r/1329317514-8131-3-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 6 ++- kernel/trace/trace_event_perf.c | 116 ++++++++++++++++++++++++++-------------- kernel/trace/trace_events.c | 10 ++-- kernel/trace/trace_kprobe.c | 6 ++- kernel/trace/trace_syscalls.c | 14 +++-- 5 files changed, 101 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c3da42dd22ba..195e3606ddd7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,8 @@ enum trace_reg { TRACE_REG_UNREGISTER, TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, }; struct ftrace_event_call; @@ -157,7 +159,7 @@ struct ftrace_event_class { void *perf_probe; #endif int (*reg)(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); int (*define_fields)(struct ftrace_event_call *); struct list_head *(*get_fields)(struct ftrace_event_call *); struct list_head fields; @@ -165,7 +167,7 @@ struct ftrace_event_class { }; extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); enum { TRACE_EVENT_FL_ENABLED_BIT, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..0cfcc37f63de 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -44,23 +44,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +77,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,6 +102,69 @@ fail: return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; @@ -130,6 +187,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -154,37 +219,6 @@ void perf_trace_del(struct perf_event *p_event, int flags) hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_destroy(struct perf_event *p_event) -{ - struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); -} - __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..5138fea37908 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -170,6 +171,9 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; @@ -209,7 +213,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_stop_cmdline_record(); call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, TRACE_REG_UNREGISTER, NULL); } break; case 1: @@ -218,7 +222,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_start_cmdline_record(); call->flags |= TRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a4..5667f8958cc9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, #endif /* CONFIG_PERF_EVENTS */ static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,9 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) case TRACE_REG_PERF_UNREGISTER: disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 43500153dd1e..e23515f51ed4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -664,13 +664,16 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -685,6 +688,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; -- cgit v1.2.3 From 489c75c3b333dfda4c8d2b7ad1b00e5da024bfa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:50 +0100 Subject: ftrace, perf: Add add/del tracepoint perf registration actions Adding TRACE_REG_PERF_ADD and TRACE_REG_PERF_DEL to handle perf event schedule in/out actions. The add action is invoked for when the perf event is scheduled in, while the del action is invoked when the event is scheduled out. Link: http://lkml.kernel.org/r/1329317514-8131-4-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_kprobe.c | 2 ++ kernel/trace/trace_syscalls.c | 4 ++++ 5 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 195e3606ddd7..2bf677cb2d6f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -148,6 +148,8 @@ enum trace_reg { TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, }; struct ftrace_event_call; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0cfcc37f63de..d72af0b03822 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -211,12 +211,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { + struct ftrace_event_call *tp_event = p_event->tp_event; hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5138fea37908..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -173,6 +173,8 @@ int ftrace_event_reg(struct ftrace_event_call *call, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5667f8958cc9..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1912,6 +1912,8 @@ int kprobe_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e23515f51ed4..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -666,6 +666,8 @@ static int syscall_enter_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } @@ -690,6 +692,8 @@ static int syscall_exit_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } -- cgit v1.2.3 From e59a0bff3ecf389951e3c9378ddfd00f6448bfaa Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:51 +0100 Subject: ftrace: Add FTRACE_ENTRY_REG macro to allow event registration Adding FTRACE_ENTRY_REG macro so particular ftrace entries could specify registration function and thus become accesible via perf. This will be used in upcomming patch for function trace. Link: http://lkml.kernel.org/r/1329317514-8131-5-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++++ kernel/trace/trace_export.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55c6ea00f28a..638476af8d7b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -68,6 +68,10 @@ enum trace_type { #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + #include "trace_entries.h" /* diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..f74de861cde9 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,14 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -152,13 +160,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef F_printk #define F_printk(fmt, args...) #fmt ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ @@ -170,4 +179,9 @@ struct ftrace_event_call __used event_##call = { \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), NULL) + #include "trace_entries.h" -- cgit v1.2.3 From ced39002f5ea736b716ae233fb68b26d59783912 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:52 +0100 Subject: ftrace, perf: Add support to use function tracepoint in perf Adding perf registration support for the ftrace function event, so it is now possible to register it via perf interface. The perf_event struct statically contains ftrace_ops as a handle for function tracer. The function tracer is registered/unregistered in open/close actions. To be efficient, we enable/disable ftrace_ops each time the traced process is scheduled in/out (via TRACE_REG_PERF_(ADD|DELL) handlers). This way tracing is enabled only when the process is running. Intentionally using this way instead of the event's hw state PERF_HES_STOPPED, which would not disable the ftrace_ops. It is now possible to use function trace within perf commands like: perf record -e ftrace:function ls perf stat -e ftrace:function ls Allowed only for root. Link: http://lkml.kernel.org/r/1329317514-8131-6-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/perf_event.h | 3 ++ kernel/trace/trace.h | 11 ++++++ kernel/trace/trace_entries.h | 6 ++- kernel/trace/trace_event_perf.c | 86 +++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_export.c | 5 +++ 5 files changed, 109 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..92a056f6d18d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,6 +859,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; struct event_filter *filter; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops ftrace_ops; +#endif #endif #ifdef CONFIG_CGROUP_PERF diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 638476af8d7b..76a1c5094bbf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -595,6 +595,8 @@ static inline int ftrace_trace_task(struct task_struct *task) static inline int ftrace_is_dead(void) { return 0; } #endif +int ftrace_event_is_function(struct ftrace_event_call *call); + /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found @@ -832,4 +834,13 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..47db7eda0531 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, @@ -64,7 +64,9 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + perf_ftrace_event_register ); /* Function call entry */ diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index d72af0b03822..fdeeb5c49627 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -250,3 +255,84 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + + head = this_cpu_ptr(event_function.perf_events); + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + return unregister_ftrace_function(ops); +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f74de861cde9..a3dbee6d04cd 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -184,4 +184,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), NULL) +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" -- cgit v1.2.3 From 02aa3162edaa166a01d193f80ccde890be8b55da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:53 +0100 Subject: ftrace: Allow to specify filter field type for ftrace events Adding FILTER_TRACE_FN event field type for function tracepoint event, so it can be properly recognized within filtering code. Currently all fields of ftrace subsystem events share the common field type FILTER_OTHER. Since the function trace fields need special care within the filtering code we need to recognize it properly, hence adding the FILTER_TRACE_FN event type. Adding filter parameter to the FTRACE_ENTRY macro, to specify the filter field type for the event. Link: http://lkml.kernel.org/r/1329317514-8131-7-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 23 +++++++++-------- kernel/trace/trace_entries.h | 48 +++++++++++++++++++++++++---------- kernel/trace/trace_events_filter.c | 7 +++++- kernel/trace/trace_export.c | 51 +++++++++++++++++++++----------------- 5 files changed, 83 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2bf677cb2d6f..dd478fc8f9f5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -245,6 +245,7 @@ enum { FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING, + FILTER_TRACE_FN, }; #define EVENT_STORAGE_SIZE 128 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 76a1c5094bbf..29f93cd434a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,21 +56,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -826,12 +828,13 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 47db7eda0531..d91eb0541b3a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -66,6 +66,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + FILTER_TRACE_FN, + perf_ftrace_event_register ); @@ -80,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -100,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -129,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -148,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -171,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -187,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -204,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + __entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -217,7 +231,9 @@ FTRACE_ENTRY(print, print_entry, ), F_printk("%08lx %s", - __entry->ip, __entry->buf) + __entry->ip, __entry->buf), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -236,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -254,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -274,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 76afaee99dbc..3da3d0ec3584 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -899,6 +899,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -986,7 +991,7 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else { + } else if (!is_function_field(field)) { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index a3dbee6d04cd..7b46c9bd22ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -23,8 +23,10 @@ * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) /* not needed for this file */ #undef __field_struct @@ -52,21 +54,22 @@ #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -75,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -85,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -99,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ @@ -112,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -120,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ int \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -161,7 +165,8 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define F_printk(fmt, args...) #fmt ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ @@ -180,9 +185,9 @@ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), NULL) + PARAMS(tstruct), PARAMS(print), filter, NULL) int ftrace_event_is_function(struct ftrace_event_call *call) { -- cgit v1.2.3 From 5500fa51199aee770ce53718853732600543619e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:54 +0100 Subject: ftrace, perf: Add filter support for function trace event Adding support to filter function trace event via perf interface. It is now possible to use filter interface in the perf tool like: perf record -e ftrace:function --filter="(ip == mm_*)" ls The filter syntax is restricted to the the 'ip' field only, and following operators are accepted '==' '!=' '||', ending up with the filter strings like: ip == f1[, ]f2 ... || ip != f3[, ]f4 ... with comma ',' or space ' ' as a function separator. If the space ' ' is used as a separator, the right side of the assignment needs to be enclosed in double quotes '"', e.g.: perf record -e ftrace:function --filter '(ip == do_execve,sys_*,ext*)' ls perf record -e ftrace:function --filter '(ip == "do_execve,sys_*,ext*")' ls perf record -e ftrace:function --filter '(ip == "do_execve sys_* ext*")' ls The '==' operator adds trace filter with same effect as would be added via set_ftrace_filter file. The '!=' operator adds trace filter with same effect as would be added via set_ftrace_notrace file. The right side of the '!=', '==' operators is list of functions or regexp. to be added to filter separated by space. The '||' operator is used for connecting multiple filter definitions together. It is possible to have more than one '==' and '!=' operators within one filter string. Link: http://lkml.kernel.org/r/1329317514-8131-8-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 6 ++ kernel/trace/trace.h | 2 - kernel/trace/trace_event_perf.c | 4 +- kernel/trace/trace_events_filter.c | 165 +++++++++++++++++++++++++++++++++++-- 5 files changed, 172 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 64a309d2fbd6..72a6cabb4d5b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); +void ftrace_free_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -380,9 +381,6 @@ extern void ftrace_enable_daemon(void); #else static inline int skip_trace(unsigned long ip) { return 0; } static inline int ftrace_force_update(void) { return 0; } -static inline void ftrace_set_filter(unsigned char *buf, int len, int reset) -{ -} static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} @@ -406,6 +404,9 @@ static inline int ftrace_text_reserved(void *start, void *end) */ #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) +#define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_free_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f615f974d90e..867bd1dd2dd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1186,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 29f93cd434a5..54faec790bc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,9 +776,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fdeeb5c49627..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -298,7 +298,9 @@ static int perf_ftrace_function_register(struct perf_event *event) static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - return unregister_ftrace_function(ops); + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; } static void perf_ftrace_function_enable(struct perf_event *event) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3da3d0ec3584..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -991,7 +993,12 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else if (!is_function_field(field)) { + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } + } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else @@ -1338,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1954,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event) __free_filter(filter); } +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { @@ -1974,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: -- cgit v1.2.3 From fadf0464b83f91ba021a358c0238a0810c0d2a0b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 21 Feb 2012 15:02:53 -0500 Subject: jump label: Add a WARN() if jump label key count goes negative The count on a jump label key should never go negative. Add a WARN() to check for this condition. Signed-off-by: Jason Baron Cc: Gleb Natapov Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/3c68556121be4d1920417a3fe367da1ec38246b4.1329851692.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70fc98a..ed9654fd7d27 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -76,8 +76,11 @@ EXPORT_SYMBOL_GPL(jump_label_inc); static void __jump_label_dec(struct jump_label_key *key, unsigned long rate_limit, struct delayed_work *work) { - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { + WARN(atomic_read(&key->enabled) < 0, + "jump label: negative count!\n"); return; + } if (rate_limit) { atomic_inc(&key->enabled); -- cgit v1.2.3 From a746e3cc984b0aa5b620dd07c1a433283b1835cf Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 21 Feb 2012 15:02:57 -0500 Subject: jump label: Fix compiler warning While cross-compiling on sparc64, I found: kernel/jump_label.c: In function 'jump_label_update': kernel/jump_label.c:447:40: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Fix by casting to 'unsigned long'. Signed-off-by: Jason Baron Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/08026cbc6df80619cae833ef1ebbbc43efab69ab.1329851692.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ed9654fd7d27..543782e7cdd2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -424,7 +424,7 @@ static void jump_label_update(struct jump_label_key *key, int enable) struct jump_entry *entry = key->entries, *stop = __stop___jump_table; #ifdef CONFIG_MODULES - struct module *mod = __module_address((jump_label_t)key); + struct module *mod = __module_address((unsigned long)key); __jump_label_mod_update(key, enable); -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..5b448a74d0f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd7..b149b88ea795 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c55..1b22f6de2932 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca45..4d6d77ed9b9d 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f1..ae098c438f00 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b67..6c32190dc73e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f8..5080d16a832f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ead..3a16c1483b45 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..c0180fd372d2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(¶virt_steal_enabled); + static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) - jump_label_inc(¶virt_steal_rq_enabled); + static_key_slow_inc(¶virt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..ea7b4fd34676 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). + * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..dcde2d9268cd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6b..5e0f8bb89b2b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd2..bf9dcadbb53a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..112c6824476b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..423547ada38a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..b4cd6d8ea150 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..da7ce7f0e566 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. */ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d7..495586232aa1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..3a4e5817a2a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9cd..0c2850874254 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..602fb305365f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b301..e1b7e051332e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3 From 8eedce996556d7d06522cd3a0e6069141c8dffe0 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 28 Feb 2012 13:49:01 -0500 Subject: static keys: Inline the static_key_enabled() function In the jump label enabled case, calling static_key_enabled() results in a function call. The function returns the results of a compare, so it really doesn't need the overhead of a full function call. Let's make it 'static inline' for both the jump label enabled and disabled cases. Signed-off-by: Jason Baron Cc: a.p.zijlstra@chello.nl Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@polymtl.ca Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/201202281849.q1SIn1p2023270@int-mx10.intmail.prod.int.phx2.redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 11 +++++------ kernel/jump_label.c | 6 ------ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2172da2d9bb4..c513a40510f5 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -127,7 +127,6 @@ extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern void static_key_slow_dec_deferred(struct static_key_deferred *key); -extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -198,11 +197,6 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool static_key_enabled(struct static_key *key) -{ - return (atomic_read(&key->enabled) > 0); -} - static inline int jump_label_apply_nops(struct module *mod) { return 0; @@ -224,4 +218,9 @@ jump_label_rate_limit(struct static_key_deferred *key, #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled +static inline bool static_key_enabled(struct static_key *key) +{ + return (atomic_read(&key->enabled) > 0); +} + #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bf9dcadbb53a..43049192b5ec 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -29,12 +29,6 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool static_key_enabled(struct static_key *key) -{ - return (atomic_read(&key->enabled) > 0); -} -EXPORT_SYMBOL_GPL(static_key_enabled); - static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; -- cgit v1.2.3 From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:51 +0100 Subject: perf: Add generic taken branch sampling support This patch adds the ability to sample taken branches to the perf_event interface. The ability to capture taken branches is very useful for all sorts of analysis. For instance, basic block profiling, call counts, statistical call graph. This new capability requires hardware assist and as such may not be available on all HW platforms. On Intel x86 it is implemented on top of the Last Branch Record (LBR) facility. To enable taken branches sampling, the PERF_SAMPLE_BRANCH_STACK bit must be set in attr->sample_type. Sampled taken branches may be filtered by type and/or priv levels. The patch adds a new field, called branch_sample_type, to the perf_event_attr structure. It contains a bitmask of filters to apply to the sampled taken branches. Filters may be implemented in HW. If the HW filter does not exist or is not good enough, some arch may also implement a SW filter. The following generic filters are currently defined: - PERF_SAMPLE_USER only branches whose targets are at the user level - PERF_SAMPLE_KERNEL only branches whose targets are at the kernel level - PERF_SAMPLE_HV only branches whose targets are at the hypervisor level - PERF_SAMPLE_ANY any type of branches (subject to priv levels filters) - PERF_SAMPLE_ANY_CALL any call branches (may incl. syscall on some arch) - PERF_SAMPLE_ANY_RET any return branches (may incl. syscall returns on some arch) - PERF_SAMPLE_IND_CALL indirect call branches Obviously filter may be combined. The priv level bits are optional. If not provided, the priv level of the associated event are used. It is possible to collect branches at a priv level different from the associated event. Use of kernel, hv priv levels is subject to permissions and availability (hv). The number of taken branch records present in each sample may vary based on HW, the type of sampled branches, the executed code. Therefore each sample contains the number of taken branches it contains. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 +++++---- include/linux/perf_event.h | 71 ++++++++++++++++++++++++++++-- kernel/events/core.c | 68 ++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 47a7e63bfe54..309d0cc69163 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -142,9 +142,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].flags = 0; + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } @@ -165,19 +167,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, flags = 0; + u64 from, to, mis = 0, pred = 0; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); if (lbr_format == LBR_FORMAT_EIP_FLAGS) { - flags = !!(from & LBR_FROM_FLAG_MISPRED); + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; from = (u64)((((s64)from) << 1) >> 1); } - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].flags = flags; + cpuc->lbr_entries[i].from = from; + cpuc->lbr_entries[i].to = to; + cpuc->lbr_entries[i].mispred = mis; + cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 64426b71381f..5fc494f4a094 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,10 +129,39 @@ enum perf_event_sample_format { PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ }; +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + + PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ +}; + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -240,6 +269,7 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; + __u64 branch_sample_type; /* enum branch_sample_type */ }; /* @@ -458,6 +488,8 @@ enum perf_event_type { * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK * }; */ PERF_RECORD_SAMPLE = 9, @@ -530,12 +562,34 @@ struct perf_raw_record { void *data; }; +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + */ struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 flags; + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + reserved:62; }; +/* + * branch stack layout: + * nr: number of taken branches stored in entries[] + * + * Note that nr can vary from sample to sample + * branches (to, from) are stored from most recent + * to least recent, i.e., entries[0] contains the most + * recent branch. + */ struct perf_branch_stack { __u64 nr; struct perf_branch_entry entries[0]; @@ -566,7 +620,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + struct hw_perf_event_extra extra_reg; + struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; @@ -1007,12 +1063,14 @@ struct perf_sample_data { u64 period; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; + struct perf_branch_stack *br_stack; }; static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; + data->br_stack = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index e8b32ac75ce3..5820efdf47cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -3907,6 +3914,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3949,6 +3974,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -5935,6 +5969,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = mask; + } + } out: return ret; -- cgit v1.2.3 From 2481c5fa6db0237e4f0168f88913178b2b495b7c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:59 +0100 Subject: perf: Disable PERF_SAMPLE_BRANCH_* when not supported PERF_SAMPLE_BRANCH_* is disabled for: - SW events (sw counters, tracepoints) - HW breakpoints - ALL but Intel x86 architecture - AMD64 processors Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 4 ++++ arch/arm/kernel/perf_event.c | 4 ++++ arch/mips/kernel/perf_event_mipsxx.c | 4 ++++ arch/powerpc/kernel/perf_event.c | 4 ++++ arch/sh/kernel/perf_event.c | 4 ++++ arch/sparc/kernel/perf_event.c | 4 ++++ arch/x86/kernel/cpu/perf_event_amd.c | 3 +++ kernel/events/core.c | 24 ++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 6 ++++++ 9 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8143cd7cdbfb..0dae252f7a33 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -685,6 +685,10 @@ static int alpha_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 5bb91bf3d47f..a23c42abc694 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -539,6 +539,10 @@ static int armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (armpmu->map_event(event) == -ENOENT) return -ENOENT; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index e3b897acfbc0..811084f4e422 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -606,6 +606,10 @@ static int mipspmu_event_init(struct perf_event *event) { int err = 0; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index f04c2301725e..c2e27ede07ec 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1084,6 +1084,10 @@ static int power_pmu_event_init(struct perf_event *event) if (!ppmu) return -ENOENT; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 10b14e3a7eb8..068b8a2759b5 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -310,6 +310,10 @@ static int sh_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HW_CACHE: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 614da624330c..8e16a4a21582 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1105,6 +1105,10 @@ static int sparc_pmu_event_init(struct perf_event *event) if (atomic_read(&nmi_active) < 0) return -ENODEV; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (attr->type) { case PERF_TYPE_HARDWARE: if (attr->config >= sparc_pmu->max_events) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67250a52430b..dd002faff7a6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 diff --git a/kernel/events/core.c b/kernel/events/core.c index 5820efdf47cd..242bb51c67f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5044,6 +5044,12 @@ static int perf_swevent_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -5154,6 +5160,12 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5379,6 +5391,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5453,6 +5471,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3330022a7ac1..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++--- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++ include/linux/perf_event.h | 9 +++- kernel/events/core.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cea567483274..0a18d16cb58d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f104c054dc5c..74387c12dc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -324,6 +324,7 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + void (*flush_branch_stack)(void); /* * Intel Arch Perfmon v2+ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7cc1e2dcc4dd..6627089232a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; static __init void intel_clovertown_quirk(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 242bb51c67f2..c61234b1a988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -137,6 +137,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_rotate_start(ctx->pmu); } +/* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event) atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } if (event->rb) { @@ -5924,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; -- cgit v1.2.3