From 752be5c5c910a1a270e97b5b39c7a7d06a39e7c6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 1 Aug 2022 11:32:25 +0900 Subject: tracing/eprobe: Add eprobe filter support Add the filter option to the event probe. This is useful if user wants to derive a new event based on the condition of the original event. E.g. echo 'e:egroup/stat_runtime_4core sched/sched_stat_runtime \ runtime=$runtime:u32 if cpu < 4' >> ../dynamic_events Then it can filter the events only on first 4 cores. Note that the fields used for 'if' must be the fields in the original events, not eprobe events. Link: https://lkml.kernel.org/r/165932114513.2850673.2592206685744598080.stgit@devnote2 Cc: Tzvetomir Stoyanov Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 104 ++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_probe.h | 3 +- 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 1783e3478912..78299d3724a2 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -26,6 +26,9 @@ struct trace_eprobe { /* tracepoint event */ const char *event_name; + /* filter string for the tracepoint */ + char *filter_str; + struct trace_event_call *event; struct dyn_event devent; @@ -664,14 +667,15 @@ static struct event_trigger_data * new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) { struct event_trigger_data *trigger; + struct event_filter *filter = NULL; struct eprobe_data *edata; + int ret; edata = kzalloc(sizeof(*edata), GFP_KERNEL); trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); if (!trigger || !edata) { - kfree(edata); - kfree(trigger); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto error; } trigger->flags = EVENT_TRIGGER_FL_PROBE; @@ -686,13 +690,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->cmd_ops = &event_trigger_cmd; INIT_LIST_HEAD(&trigger->list); - RCU_INIT_POINTER(trigger->filter, NULL); + + if (ep->filter_str) { + ret = create_event_filter(file->tr, file->event_call, + ep->filter_str, false, &filter); + if (ret) + goto error; + } + RCU_INIT_POINTER(trigger->filter, filter); edata->file = file; edata->ep = ep; trigger->private_data = edata; return trigger; +error: + free_event_filter(filter); + kfree(edata); + kfree(trigger); + return ERR_PTR(ret); } static int enable_eprobe(struct trace_eprobe *ep, @@ -726,6 +742,7 @@ static int disable_eprobe(struct trace_eprobe *ep, { struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; + struct event_filter *filter; struct eprobe_data *edata; file = find_event_file(tr, ep->event_system, ep->event_name); @@ -752,6 +769,10 @@ static int disable_eprobe(struct trace_eprobe *ep, /* Make sure nothing is using the edata or trigger */ tracepoint_synchronize_unregister(); + filter = rcu_access_pointer(trigger->filter); + + if (filter) + free_event_filter(filter); kfree(edata); kfree(trigger); @@ -927,12 +948,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ return ret; } +static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) +{ + struct event_filter *dummy; + int i, ret, len = 0; + char *p; + + if (argc == 0) { + trace_probe_log_err(0, NO_EP_FILTER); + return -EINVAL; + } + + /* Recover the filter string */ + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + + ep->filter_str = kzalloc(len, GFP_KERNEL); + if (!ep->filter_str) + return -ENOMEM; + + p = ep->filter_str; + for (i = 0; i < argc; i++) { + ret = snprintf(p, len, "%s ", argv[i]); + if (ret < 0) + goto error; + if (ret > len) { + ret = -E2BIG; + goto error; + } + p += ret; + len -= ret; + } + p[-1] = '\0'; + + /* + * Ensure the filter string can be parsed correctly. Note, this + * filter string is for the original event, not for the eprobe. + */ + ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, + true, &dummy); + free_event_filter(dummy); + if (ret) + goto error; + + return 0; +error: + kfree(ep->filter_str); + ep->filter_str = NULL; + return ret; +} + static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] - * Fetch args: + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER] + * Fetch args (no space): * =$[:TYPE] */ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; @@ -942,8 +1013,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - int ret = 0; - int i; + int ret = 0, filter_idx = 0; + int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; @@ -973,6 +1044,15 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = buf1; } + for (i = 2; i < argc; i++) { + if (!strcmp(argv[i], "if")) { + filter_idx = i + 1; + filter_cnt = argc - filter_idx; + argc = i; + break; + } + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -988,6 +1068,14 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } + if (filter_idx) { + trace_probe_log_set_index(filter_idx); + ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); + if (ret) + goto parse_error; + } else + ep->filter_str = NULL; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 3b3869ae8cfd..de38f1c03776 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(SAME_PROBE, "There is already the exact same probe event"),\ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ - C(BAD_ATTACH_ARG, "Attached event does not have this field"), + C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ + C(NO_EP_FILTER, "No filter rule after 'if'"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From 9e14bae7d049cfdd5ab22cb200bc7ea847cfa8c9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 1 Aug 2022 11:32:34 +0900 Subject: selftests/ftrace: Add eprobe syntax error testcase Add a syntax error test case for eprobe as same as kprobes. Link: https://lkml.kernel.org/r/165932115471.2850673.8014722990775242727.stgit@devnote2 Cc: Tzvetomir Stoyanov Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../test.d/dynevent/eprobes_syntax_errors.tc | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc new file mode 100644 index 000000000000..fc1daac7f066 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Event probe event parser error log check +# requires: dynamic_events events/syscalls/sys_enter_openat ". []":README error_log + +check_error() { # command-with-error-pos-by-^ + ftrace_errlog_check 'event_probe' "$1" 'dynamic_events' +} + +check_error 'e ^a.' # NO_EVENT_INFO +check_error 'e ^.b' # NO_EVENT_INFO +check_error 'e ^a.b' # BAD_ATTACH_EVENT +check_error 'e syscalls/sys_enter_openat ^foo' # BAD_ATTACH_ARG +check_error 'e:^/bar syscalls/sys_enter_openat' # NO_GROUP_NAME +check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat' # GROUP_TOO_LONG + +check_error 'e:^foo.1/bar syscalls/sys_enter_openat' # BAD_GROUP_NAME +check_error 'e:^ syscalls/sys_enter_openat' # NO_EVENT_NAME +check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat' # EVENT_TOO_LONG +check_error 'e:foo/^bar.1 syscalls/sys_enter_openat' # BAD_EVENT_NAME + +check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd' # BAD_FETCH_ARG +check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo' # BAD_ATTACH_ARG + +check_error 'e:foo/bar syscalls/sys_enter_openat if ^' # NO_EP_FILTER + +exit 0 -- cgit v1.2.3 From 01c44bf8337ac79f31aa090e842bf3b9dd762aca Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Wed, 24 Aug 2022 11:43:56 +0800 Subject: rv/monitors: add 'static' qualifier for local symbols The sparse tool complains as follows: kernel/trace/rv/monitors/wwnr/wwnr.c:18:19: warning: symbol 'rv_wwnr' was not declared. Should it be static? The `rv_wwnr` symbol is not dereferenced by other extern files, so add static qualifier for it. So does wip module. Link: https://lkml.kernel.org/r/20220824034357.2014202-2-zengheng4@huawei.com Cc: Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor") Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k") Signed-off-by: Zeng Heng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.c | 4 ++-- kernel/trace/rv/monitors/wwnr/wwnr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 83cace53b9fa..1a989bc142f3 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -16,7 +16,7 @@ #include "wip.h" -struct rv_monitor rv_wip; +static struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) @@ -60,7 +60,7 @@ static void disable_wip(void) da_monitor_destroy_wip(); } -struct rv_monitor rv_wip = { +static struct rv_monitor rv_wip = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 599225d9cf38..a063b93c6a1d 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -15,7 +15,7 @@ #include "wwnr.h" -struct rv_monitor rv_wwnr; +static struct rv_monitor rv_wwnr; DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); static void handle_switch(void *data, bool preempt, struct task_struct *p, @@ -59,7 +59,7 @@ static void disable_wwnr(void) da_monitor_destroy_wwnr(); } -struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_wwnr = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, -- cgit v1.2.3 From 4359a011e259a4608afc7fb3635370c9d4ba5943 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Wed, 24 Aug 2022 11:43:57 +0800 Subject: rv/dot2K: add 'static' qualifier for local variable Following Daniel's suggestion, fix similar warning in template files, which would prevent new monitors from such warning. Link: https://lkml.kernel.org/r/20220824034357.2014202-3-zengheng4@huawei.com Cc: Fixes: 24bce201d798 ("tools/rv: Add dot2k") Suggested-by: Daniel Bristot de Oliveira Signed-off-by: Zeng Heng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k_templates/main_global.c | 4 ++-- tools/verification/dot2/dot2k_templates/main_per_cpu.c | 4 ++-- tools/verification/dot2/dot2k_templates/main_per_task.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main_global.c index f4b712dbc92e..dcd1162dced8 100644 --- a/tools/verification/dot2/dot2k_templates/main_global.c +++ b/tools/verification/dot2/dot2k_templates/main_global.c @@ -27,7 +27,7 @@ * * The rv monitor reference is needed for the monitor declaration. */ -struct rv_monitor rv_MODEL_NAME; +static struct rv_monitor rv_MODEL_NAME; DECLARE_DA_MON_GLOBAL(MODEL_NAME, MIN_TYPE); /* @@ -63,7 +63,7 @@ TRACEPOINT_DETACH /* * This is the monitor register section. */ -struct rv_monitor rv_MODEL_NAME = { +static struct rv_monitor rv_MODEL_NAME = { .name = "MODEL_NAME", .description = "auto-generated MODEL_NAME", .enable = enable_MODEL_NAME, diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c index 4080d1ca3354..8f877e86a22f 100644 --- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c +++ b/tools/verification/dot2/dot2k_templates/main_per_cpu.c @@ -27,7 +27,7 @@ * * The rv monitor reference is needed for the monitor declaration. */ -struct rv_monitor rv_MODEL_NAME; +static struct rv_monitor rv_MODEL_NAME; DECLARE_DA_MON_PER_CPU(MODEL_NAME, MIN_TYPE); /* @@ -63,7 +63,7 @@ TRACEPOINT_DETACH /* * This is the monitor register section. */ -struct rv_monitor rv_MODEL_NAME = { +static struct rv_monitor rv_MODEL_NAME = { .name = "MODEL_NAME", .description = "auto-generated MODEL_NAME", .enable = enable_MODEL_NAME, diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c index 89197175384f..8c2fdb824634 100644 --- a/tools/verification/dot2/dot2k_templates/main_per_task.c +++ b/tools/verification/dot2/dot2k_templates/main_per_task.c @@ -27,7 +27,7 @@ * * The rv monitor reference is needed for the monitor declaration. */ -struct rv_monitor rv_MODEL_NAME; +static struct rv_monitor rv_MODEL_NAME; DECLARE_DA_MON_PER_TASK(MODEL_NAME, MIN_TYPE); /* @@ -63,7 +63,7 @@ TRACEPOINT_DETACH /* * This is the monitor register section. */ -struct rv_monitor rv_MODEL_NAME = { +static struct rv_monitor rv_MODEL_NAME = { .name = "MODEL_NAME", .description = "auto-generated MODEL_NAME", .enable = enable_MODEL_NAME, -- cgit v1.2.3 From b7b037eb5f548cc947e743d456d66eb110316f1e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 6 Sep 2022 18:53:14 -0400 Subject: tracing: Add numeric delta time to the trace event benchmark In order to testing filtering and histograms via the trace event benchmark, record the delta time of the last event as a numeric value (currently, it just saves it within the string) so that filters and histograms can use it. Link: https://lkml.kernel.org/r/20220906225529.213677569@goodmis.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_benchmark.c | 2 +- kernel/trace/trace_benchmark.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 801c2a7f7605..54d5fa35c90a 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -51,7 +51,7 @@ static void trace_do_benchmark(void) local_irq_disable(); start = trace_clock_local(); - trace_benchmark_event(bm_str); + trace_benchmark_event(bm_str, bm_last); stop = trace_clock_local(); local_irq_enable(); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 79e6fbe5b365..c3e91060dc94 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void); TRACE_EVENT_FN(benchmark_event, - TP_PROTO(const char *str), + TP_PROTO(const char *str, u64 delta), - TP_ARGS(str), + TP_ARGS(str, delta), TP_STRUCT__entry( __array( char, str, BENCHMARK_EVENT_STRLEN ) + __field( u64, delta) ), TP_fast_assign( memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + __entry->delta = delta; ), - TP_printk("%s", __entry->str), + TP_printk("%s delta=%llu", __entry->str, __entry->delta), trace_benchmark_reg, trace_benchmark_unreg ); -- cgit v1.2.3 From 86087383ec0a7ac2bcc3284e13e0e9966f7e3bfa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 6 Sep 2022 18:53:15 -0400 Subject: tracing/hist: Call hist functions directly via a switch statement Due to retpolines, indirect calls are much more expensive than direct calls. The histograms have a select set of functions it uses for the histograms, instead of using function pointers to call them, create a hist_fn_call() function that uses a switch statement to call the histogram functions directly. This gives a 13% speedup to the histogram logic. Using the histogram benchmark: Before: # event histogram # # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active] # { delta: 129 } hitcount: 2213 { delta: 130 } hitcount: 285965 { delta: 131 } hitcount: 1146545 { delta: 132 } hitcount: 5185432 { delta: 133 } hitcount: 19896215 { delta: 134 } hitcount: 53118616 { delta: 135 } hitcount: 83816709 { delta: 136 } hitcount: 68329562 { delta: 137 } hitcount: 41859349 { delta: 138 } hitcount: 46257797 { delta: 139 } hitcount: 54400831 { delta: 140 } hitcount: 72875007 { delta: 141 } hitcount: 76193272 { delta: 142 } hitcount: 49504263 { delta: 143 } hitcount: 38821072 { delta: 144 } hitcount: 47702679 { delta: 145 } hitcount: 41357297 { delta: 146 } hitcount: 22058238 { delta: 147 } hitcount: 9720002 { delta: 148 } hitcount: 3193542 { delta: 149 } hitcount: 927030 { delta: 150 } hitcount: 850772 { delta: 151 } hitcount: 1477380 { delta: 152 } hitcount: 2687977 { delta: 153 } hitcount: 2865985 { delta: 154 } hitcount: 1977492 { delta: 155 } hitcount: 2475607 { delta: 156 } hitcount: 3403612 After: # event histogram # # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active] # { delta: 113 } hitcount: 272 { delta: 114 } hitcount: 840 { delta: 118 } hitcount: 344 { delta: 119 } hitcount: 25428 { delta: 120 } hitcount: 350590 { delta: 121 } hitcount: 1892484 { delta: 122 } hitcount: 6205004 { delta: 123 } hitcount: 11583521 { delta: 124 } hitcount: 37590979 { delta: 125 } hitcount: 108308504 { delta: 126 } hitcount: 131672461 { delta: 127 } hitcount: 88700598 { delta: 128 } hitcount: 65939870 { delta: 129 } hitcount: 45055004 { delta: 130 } hitcount: 33174464 { delta: 131 } hitcount: 31813493 { delta: 132 } hitcount: 29011676 { delta: 133 } hitcount: 22798782 { delta: 134 } hitcount: 22072486 { delta: 135 } hitcount: 17034113 { delta: 136 } hitcount: 8982490 { delta: 137 } hitcount: 2865908 { delta: 138 } hitcount: 980382 { delta: 139 } hitcount: 1651944 { delta: 140 } hitcount: 4112073 { delta: 141 } hitcount: 3963269 { delta: 142 } hitcount: 1712508 { delta: 143 } hitcount: 575941 { delta: 144 } hitcount: 351427 { delta: 145 } hitcount: 218077 { delta: 146 } hitcount: 167297 { delta: 147 } hitcount: 146198 { delta: 148 } hitcount: 116122 { delta: 149 } hitcount: 58993 { delta: 150 } hitcount: 40228 The delta above is in nanoseconds. It brings the fastest time down from 129ns to 113ns, and the peak from 141ns to 126ns. Link: https://lkml.kernel.org/r/20220906225529.411545333@goodmis.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 246 +++++++++++++++++++++++++++------------ 1 file changed, 169 insertions(+), 77 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fdf784620c28..48465f7e97b4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -104,6 +104,38 @@ enum field_op_id { FIELD_OP_MULT, }; +enum hist_field_fn { + HIST_FIELD_FN_NOP, + HIST_FIELD_FN_VAR_REF, + HIST_FIELD_FN_COUNTER, + HIST_FIELD_FN_CONST, + HIST_FIELD_FN_LOG2, + HIST_FIELD_FN_BUCKET, + HIST_FIELD_FN_TIMESTAMP, + HIST_FIELD_FN_CPU, + HIST_FIELD_FN_STRING, + HIST_FIELD_FN_DYNSTRING, + HIST_FIELD_FN_RELDYNSTRING, + HIST_FIELD_FN_PSTRING, + HIST_FIELD_FN_S64, + HIST_FIELD_FN_U64, + HIST_FIELD_FN_S32, + HIST_FIELD_FN_U32, + HIST_FIELD_FN_S16, + HIST_FIELD_FN_U16, + HIST_FIELD_FN_S8, + HIST_FIELD_FN_U8, + HIST_FIELD_FN_UMINUS, + HIST_FIELD_FN_MINUS, + HIST_FIELD_FN_PLUS, + HIST_FIELD_FN_DIV, + HIST_FIELD_FN_MULT, + HIST_FIELD_FN_DIV_POWER2, + HIST_FIELD_FN_DIV_NOT_POWER2, + HIST_FIELD_FN_DIV_MULT_SHIFT, + HIST_FIELD_FN_EXECNAME, +}; + /* * A hist_var (histogram variable) contains variable information for * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF @@ -123,15 +155,15 @@ struct hist_var { struct hist_field { struct ftrace_event_field *field; unsigned long flags; - hist_field_fn_t fn; - unsigned int ref; - unsigned int size; - unsigned int offset; - unsigned int is_signed; unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + enum hist_field_fn fn_num; + unsigned int ref; + unsigned int size; + unsigned int offset; + unsigned int is_signed; /* * Variable fields contain variable-specific info in var. @@ -166,14 +198,11 @@ struct hist_field { u64 div_multiplier; }; -static u64 hist_field_none(struct hist_field *field, - struct tracing_map_elt *elt, - struct trace_buffer *buffer, - struct ring_buffer_event *rbe, - void *event) -{ - return 0; -} +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); static u64 hist_field_const(struct hist_field *field, struct tracing_map_elt *elt, @@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field, struct hist_field *operand = hist_field->operands[0]; unsigned long buckets = hist_field->buckets; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); if (WARN_ON_ONCE(!buckets)) return val; @@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 + val2; } @@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 - val2; } @@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); /* Return -1 for the undefined case */ if (!val2) @@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return val1 >> __ffs64(operand2->constant); } @@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return div64_u64(val1, operand2->constant); } @@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); /* * If the divisor is a constant, do a multiplication and shift instead. @@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 * val2; } @@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); + s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -657,19 +686,19 @@ struct snapshot_context { * Returns the specific division function to use if the divisor * is constant. This avoids extra branches when the trigger is hit. */ -static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor) +static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor) { u64 div = divisor->constant; if (!(div & (div - 1))) - return div_by_power_of_two; + return HIST_FIELD_FN_DIV_POWER2; /* If the divisor is too large, do a regular division */ if (div > (1 << HIST_DIV_SHIFT)) - return div_by_not_power_of_two; + return HIST_FIELD_FN_DIV_NOT_POWER2; divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); - return div_by_mult_and_shift; + return HIST_FIELD_FN_DIV_MULT_SHIFT; } static void track_data_free(struct track_data *track_data) @@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field, return field_name; } -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +static enum hist_field_fn select_value_fn(int field_size, int field_is_signed) { - hist_field_fn_t fn = NULL; - switch (field_size) { case 8: if (field_is_signed) - fn = hist_field_s64; + return HIST_FIELD_FN_S64; else - fn = hist_field_u64; - break; + return HIST_FIELD_FN_U64; case 4: if (field_is_signed) - fn = hist_field_s32; + return HIST_FIELD_FN_S32; else - fn = hist_field_u32; - break; + return HIST_FIELD_FN_U32; case 2: if (field_is_signed) - fn = hist_field_s16; + return HIST_FIELD_FN_S16; else - fn = hist_field_u16; - break; + return HIST_FIELD_FN_U16; case 1: if (field_is_signed) - fn = hist_field_s8; + return HIST_FIELD_FN_S8; else - fn = hist_field_u8; - break; + return HIST_FIELD_FN_U8; } - return fn; + return HIST_FIELD_FN_NOP; } static int parse_map_size(char *str) @@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; /* caller will populate */ if (flags & HIST_FIELD_FL_VAR_REF) { - hist_field->fn = hist_field_var_ref; + hist_field->fn_num = HIST_FIELD_FN_VAR_REF; goto out; } if (flags & HIST_FIELD_FL_HITCOUNT) { - hist_field->fn = hist_field_counter; + hist_field->fn_num = HIST_FIELD_FN_COUNTER; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CONST) { - hist_field->fn = hist_field_const; + hist_field->fn_num = HIST_FIELD_FN_CONST; hist_field->size = sizeof(u64); hist_field->type = kstrdup("u64", GFP_KERNEL); if (!hist_field->type) @@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn = hist_field_none; + hist_field->fn_num = HIST_FIELD_FN_NOP; goto out; } if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); - hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : - hist_field_bucket; + hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 : + HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); @@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_TIMESTAMP) { - hist_field->fn = hist_field_timestamp; + hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { - hist_field->fn = hist_field_cpu; + hist_field->fn_num = HIST_FIELD_FN_CPU; hist_field->size = sizeof(int); hist_field->type = "unsigned int"; goto out; @@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto free; if (field->filter_type == FILTER_STATIC_STRING) { - hist_field->fn = hist_field_string; + hist_field->fn_num = HIST_FIELD_FN_STRING; hist_field->size = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - hist_field->fn = hist_field_dynstring; + hist_field->fn_num = HIST_FIELD_FN_DYNSTRING; } else if (field->filter_type == FILTER_RDYN_STRING) - hist_field->fn = hist_field_reldynstring; + hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else - hist_field->fn = hist_field_pstring; + hist_field->fn_num = HIST_FIELD_FN_PSTRING; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; @@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - hist_field->fn = select_value_fn(field->size, - field->is_signed); - if (!hist_field->fn) { + hist_field->fn_num = select_value_fn(field->size, + field->is_signed); + if (hist_field->fn_num == HIST_FIELD_FN_NOP) { destroy_hist_field(hist_field, 0); return NULL; } @@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, if (!alias) return NULL; - alias->fn = var_ref->fn; + alias->fn_num = var_ref->fn_num; alias->operands[0] = var_ref; if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { @@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); - expr->fn = hist_field_unary_minus; + expr->fn_num = HIST_FIELD_FN_UMINUS; expr->operands[0] = operand1; expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - hist_field_fn_t op_fn; + enum hist_field_fn op_fn; bool combine_consts; if (*n_subexprs > 3) { @@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, switch (field_op) { case FIELD_OP_MINUS: - op_fn = hist_field_minus; + op_fn = HIST_FIELD_FN_MINUS; break; case FIELD_OP_PLUS: - op_fn = hist_field_plus; + op_fn = HIST_FIELD_FN_PLUS; break; case FIELD_OP_DIV: - op_fn = hist_field_div; + op_fn = HIST_FIELD_FN_DIV; break; case FIELD_OP_MULT: - op_fn = hist_field_mult; + op_fn = HIST_FIELD_FN_MULT; break; default: ret = -EINVAL; @@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, op_fn = hist_field_get_div_fn(operand2); } + expr->fn_num = op_fn; + if (combine_consts) { if (var1) expr->operands[0] = var1; if (var2) expr->operands[1] = var2; - expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); + expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL); + expr->fn_num = HIST_FIELD_FN_CONST; expr->operands[0] = NULL; expr->operands[1] = NULL; @@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->name = expr_str(expr, 0); } else { - expr->fn = op_fn; - /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, buffer, rbe, rec); + var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + switch (hist_field->fn_num) { + case HIST_FIELD_FN_VAR_REF: + return hist_field_var_ref(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COUNTER: + return hist_field_counter(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CONST: + return hist_field_const(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_LOG2: + return hist_field_log2(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_BUCKET: + return hist_field_bucket(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_TIMESTAMP: + return hist_field_timestamp(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CPU: + return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STRING: + return hist_field_string(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DYNSTRING: + return hist_field_dynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_RELDYNSTRING: + return hist_field_reldynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PSTRING: + return hist_field_pstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S64: + return hist_field_s64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U64: + return hist_field_u64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S32: + return hist_field_s32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U32: + return hist_field_u32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S16: + return hist_field_s16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U16: + return hist_field_u16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S8: + return hist_field_s8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U8: + return hist_field_u8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_UMINUS: + return hist_field_unary_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MINUS: + return hist_field_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PLUS: + return hist_field_plus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV: + return hist_field_div(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MULT: + return hist_field_mult(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_POWER2: + return div_by_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_NOT_POWER2: + return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_MULT_SHIFT: + return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_EXECNAME: + return hist_field_execname(hist_field, elt, buffer, rbe, event); + default: + return 0; + } +} + /* Convert a var that points to common_pid.execname to a string */ static void update_var_execname(struct hist_field *hist_field) { @@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field) kfree_const(hist_field->type); hist_field->type = "char[]"; - hist_field->fn = hist_field_execname; + hist_field->fn_num = HIST_FIELD_FN_EXECNAME; } static int create_var_field(struct hist_trigger_data *hist_data, @@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } @@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; -- cgit v1.2.3 From 26c4e3d10ad0566f8604e6c378e939f941707f36 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 6 Sep 2022 18:53:16 -0400 Subject: tracing: Move struct filter_pred into trace_events_filter.c The structure filter_pred and the typedef of the function used are only referenced by trace_events_filter.c. There's no reason to have it in an external header file. Move them into the only file they are used in. Link: https://lkml.kernel.org/r/20220906225529.598047132@goodmis.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 13 ------------- kernel/trace/trace_events_filter.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 900e75d96c84..54ee5711c729 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); - typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { @@ -1455,17 +1453,6 @@ struct regex { regex_match_func match; }; -struct filter_pred { - filter_pred_fn_t fn; - u64 val; - struct regex regex; - unsigned short *ops; - struct ftrace_event_field *field; - int offset; - int not; - int op; -}; - static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4b1057ab9d96..c49c689ce4ad 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -43,6 +43,19 @@ enum filter_op_ids { OPS }; static const char * ops[] = { OPS }; +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + struct regex regex; + unsigned short *ops; + struct ftrace_event_field *field; + int offset; + int not; + int op; +}; + /* * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above. -- cgit v1.2.3 From fde59ab1614942a16dd0ce6f053ae96718f0ee50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 6 Sep 2022 18:53:17 -0400 Subject: tracing/filter: Call filter predicate functions directly via a switch statement Due to retpolines, indirect calls are much more expensive than direct calls. The filters have a select set of functions it uses for the predicates. Instead of using function pointers to call them, create a filter_pred_fn_call() function that uses a switch statement to call the predicate functions directly. This gives almost a 10% speedup to the filter logic. Using the histogram benchmark: Before: # event histogram # # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active] # { delta: 113 } hitcount: 272 { delta: 114 } hitcount: 840 { delta: 118 } hitcount: 344 { delta: 119 } hitcount: 25428 { delta: 120 } hitcount: 350590 { delta: 121 } hitcount: 1892484 { delta: 122 } hitcount: 6205004 { delta: 123 } hitcount: 11583521 { delta: 124 } hitcount: 37590979 { delta: 125 } hitcount: 108308504 { delta: 126 } hitcount: 131672461 { delta: 127 } hitcount: 88700598 { delta: 128 } hitcount: 65939870 { delta: 129 } hitcount: 45055004 { delta: 130 } hitcount: 33174464 { delta: 131 } hitcount: 31813493 { delta: 132 } hitcount: 29011676 { delta: 133 } hitcount: 22798782 { delta: 134 } hitcount: 22072486 { delta: 135 } hitcount: 17034113 { delta: 136 } hitcount: 8982490 { delta: 137 } hitcount: 2865908 { delta: 138 } hitcount: 980382 { delta: 139 } hitcount: 1651944 { delta: 140 } hitcount: 4112073 { delta: 141 } hitcount: 3963269 { delta: 142 } hitcount: 1712508 { delta: 143 } hitcount: 575941 After: # event histogram # # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active] # { delta: 103 } hitcount: 60 { delta: 104 } hitcount: 16966 { delta: 105 } hitcount: 396625 { delta: 106 } hitcount: 3223400 { delta: 107 } hitcount: 12053754 { delta: 108 } hitcount: 20241711 { delta: 109 } hitcount: 14850200 { delta: 110 } hitcount: 4946599 { delta: 111 } hitcount: 3479315 { delta: 112 } hitcount: 18698299 { delta: 113 } hitcount: 62388733 { delta: 114 } hitcount: 95803834 { delta: 115 } hitcount: 58278130 { delta: 116 } hitcount: 15364800 { delta: 117 } hitcount: 5586866 { delta: 118 } hitcount: 2346880 { delta: 119 } hitcount: 1131091 { delta: 120 } hitcount: 620896 { delta: 121 } hitcount: 236652 { delta: 122 } hitcount: 105957 { delta: 123 } hitcount: 119107 { delta: 124 } hitcount: 54494 { delta: 125 } hitcount: 63856 { delta: 126 } hitcount: 64454 { delta: 127 } hitcount: 34818 { delta: 128 } hitcount: 41446 { delta: 129 } hitcount: 51242 { delta: 130 } hitcount: 28361 { delta: 131 } hitcount: 23926 The peak before was 126ns per event, after the peak is 114ns, and the fastest time went from 113ns to 103ns. Link: https://lkml.kernel.org/r/20220906225529.781407172@goodmis.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 230 +++++++++++++++++++++++++------------ 1 file changed, 157 insertions(+), 73 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c49c689ce4ad..96acc2b71ac7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -43,10 +43,33 @@ enum filter_op_ids { OPS }; static const char * ops[] = { OPS }; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +enum filter_pred_fn { + FILTER_PRED_FN_NOP, + FILTER_PRED_FN_64, + FILTER_PRED_FN_S64, + FILTER_PRED_FN_U64, + FILTER_PRED_FN_32, + FILTER_PRED_FN_S32, + FILTER_PRED_FN_U32, + FILTER_PRED_FN_16, + FILTER_PRED_FN_S16, + FILTER_PRED_FN_U16, + FILTER_PRED_FN_8, + FILTER_PRED_FN_S8, + FILTER_PRED_FN_U8, + FILTER_PRED_FN_COMM, + FILTER_PRED_FN_STRING, + FILTER_PRED_FN_STRLOC, + FILTER_PRED_FN_STRRELLOC, + FILTER_PRED_FN_PCHAR_USER, + FILTER_PRED_FN_PCHAR, + FILTER_PRED_FN_CPU, + FILTER_PRED_FN_, + FILTER_PRED_TEST_VISITED, +}; struct filter_pred { - filter_pred_fn_t fn; + enum filter_pred_fn fn_num; u64 val; struct regex regex; unsigned short *ops; @@ -603,44 +626,48 @@ out_free: return ERR_PTR(ret); } +enum pred_cmp_types { + PRED_CMP_TYPE_NOP, + PRED_CMP_TYPE_LT, + PRED_CMP_TYPE_LE, + PRED_CMP_TYPE_GT, + PRED_CMP_TYPE_GE, + PRED_CMP_TYPE_BAND, +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr < val; \ -} \ -static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr <= val; \ -} \ -static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr > val; \ -} \ -static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr >= val; \ -} \ -static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return !!(*addr & val); \ -} \ -static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LE_##type, \ - filter_pred_LT_##type, \ - filter_pred_GE_##type, \ - filter_pred_GT_##type, \ - filter_pred_BAND_##type, \ -}; + switch (pred->op) { \ + case OP_LT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr < val; \ + } \ + case OP_LE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr <= val; \ + } \ + case OP_GT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr > val; \ + } \ + case OP_GE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr >= val; \ + } \ + case OP_BAND: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return !!(*addr & val); \ + } \ + default: \ + return 0; \ + } \ +} #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -849,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } -static int filter_pred_none(struct filter_pred *pred, void *event) -{ - return 0; -} - /* * regex_match_foo - Basic regex callbacks * @@ -999,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred) } } + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int test_pred_visited_fn(struct filter_pred *pred, void *event); +#else +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + return 0; +} +#endif + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event); + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -1016,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; - int match = pred->fn(pred, rec); + int match = filter_pred_fn_call(pred, rec); if (match == prog[i].when_to_branch) i = prog[i].target; } @@ -1202,10 +1237,10 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, - int field_size, int field_is_signed) +static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { - filter_pred_fn_t fn = NULL; + enum filter_pred_fn fn = FILTER_PRED_FN_NOP; int pred_func_index = -1; switch (op) { @@ -1214,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, break; default: if (WARN_ON_ONCE(op < PRED_FUNC_START)) - return NULL; + return fn; pred_func_index = op - PRED_FUNC_START; if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) - return NULL; + return fn; } switch (field_size) { case 8: if (pred_func_index < 0) - fn = filter_pred_64; + fn = FILTER_PRED_FN_64; else if (field_is_signed) - fn = pred_funcs_s64[pred_func_index]; + fn = FILTER_PRED_FN_S64; else - fn = pred_funcs_u64[pred_func_index]; + fn = FILTER_PRED_FN_U64; break; case 4: if (pred_func_index < 0) - fn = filter_pred_32; + fn = FILTER_PRED_FN_32; else if (field_is_signed) - fn = pred_funcs_s32[pred_func_index]; + fn = FILTER_PRED_FN_S32; else - fn = pred_funcs_u32[pred_func_index]; + fn = FILTER_PRED_FN_U32; break; case 2: if (pred_func_index < 0) - fn = filter_pred_16; + fn = FILTER_PRED_FN_16; else if (field_is_signed) - fn = pred_funcs_s16[pred_func_index]; + fn = FILTER_PRED_FN_S16; else - fn = pred_funcs_u16[pred_func_index]; + fn = FILTER_PRED_FN_U16; break; case 1: if (pred_func_index < 0) - fn = filter_pred_8; + fn = FILTER_PRED_FN_8; else if (field_is_signed) - fn = pred_funcs_s8[pred_func_index]; + fn = FILTER_PRED_FN_S8; else - fn = pred_funcs_u8[pred_func_index]; + fn = FILTER_PRED_FN_U8; break; } return fn; } + +static int filter_pred_fn_call(struct filter_pred *pred, void *event) +{ + switch (pred->fn_num) { + case FILTER_PRED_FN_64: + return filter_pred_64(pred, event); + case FILTER_PRED_FN_S64: + return filter_pred_s64(pred, event); + case FILTER_PRED_FN_U64: + return filter_pred_u64(pred, event); + case FILTER_PRED_FN_32: + return filter_pred_32(pred, event); + case FILTER_PRED_FN_S32: + return filter_pred_s32(pred, event); + case FILTER_PRED_FN_U32: + return filter_pred_u32(pred, event); + case FILTER_PRED_FN_16: + return filter_pred_16(pred, event); + case FILTER_PRED_FN_S16: + return filter_pred_s16(pred, event); + case FILTER_PRED_FN_U16: + return filter_pred_u16(pred, event); + case FILTER_PRED_FN_8: + return filter_pred_8(pred, event); + case FILTER_PRED_FN_S8: + return filter_pred_s8(pred, event); + case FILTER_PRED_FN_U8: + return filter_pred_u8(pred, event); + case FILTER_PRED_FN_COMM: + return filter_pred_comm(pred, event); + case FILTER_PRED_FN_STRING: + return filter_pred_string(pred, event); + case FILTER_PRED_FN_STRLOC: + return filter_pred_strloc(pred, event); + case FILTER_PRED_FN_STRRELLOC: + return filter_pred_strrelloc(pred, event); + case FILTER_PRED_FN_PCHAR_USER: + return filter_pred_pchar_user(pred, event); + case FILTER_PRED_FN_PCHAR: + return filter_pred_pchar(pred, event); + case FILTER_PRED_FN_CPU: + return filter_pred_cpu(pred, event); + case FILTER_PRED_TEST_VISITED: + return test_pred_visited_fn(pred, event); + default: + return 0; + } +} + /* Called when a predicate is encountered by predicate_parse() */ static int parse_pred(const char *str, void *data, int pos, struct filter_parse_error *pe, @@ -1351,7 +1435,7 @@ static int parse_pred(const char *str, void *data, parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); goto err_free; } - pred->fn = filter_pred_none; + pred->fn_num = FILTER_PRED_FN_NOP; /* * Quotes are not required, but if they exist then we need @@ -1429,16 +1513,16 @@ static int parse_pred(const char *str, void *data, filter_build_regex(pred); if (field->filter_type == FILTER_COMM) { - pred->fn = filter_pred_comm; + pred->fn_num = FILTER_PRED_FN_COMM; } else if (field->filter_type == FILTER_STATIC_STRING) { - pred->fn = filter_pred_string; + pred->fn_num = FILTER_PRED_FN_STRING; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - pred->fn = filter_pred_strloc; + pred->fn_num = FILTER_PRED_FN_STRLOC; } else if (field->filter_type == FILTER_RDYN_STRING) - pred->fn = filter_pred_strrelloc; + pred->fn_num = FILTER_PRED_FN_STRRELLOC; else { if (!ustring_per_cpu) { @@ -1449,9 +1533,9 @@ static int parse_pred(const char *str, void *data, } if (ustring) - pred->fn = filter_pred_pchar_user; + pred->fn_num = FILTER_PRED_FN_PCHAR_USER; else - pred->fn = filter_pred_pchar; + pred->fn_num = FILTER_PRED_FN_PCHAR; } /* go past the last quote */ i++; @@ -1499,10 +1583,10 @@ static int parse_pred(const char *str, void *data, pred->val = val; if (field->filter_type == FILTER_CPU) - pred->fn = filter_pred_cpu; + pred->fn_num = FILTER_PRED_FN_CPU; else { - pred->fn = select_comparison_fn(pred->op, field->size, - field->is_signed); + pred->fn_num = select_comparison_fn(pred->op, field->size, + field->is_signed); if (pred->op == OP_NE) pred->not = 1; } @@ -2309,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; - WARN_ON_ONCE(!pred->fn); + WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); if (!field) { WARN_ONCE(1, "all leafs should have field defined %d", i); @@ -2319,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) if (!strchr(fields, *field->name)) continue; - pred->fn = test_pred_visited_fn; + pred->fn_num = FILTER_PRED_TEST_VISITED; } } -- cgit v1.2.3 From 51714678eacc91fcfb8b235ced57ea70ade81cad Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 14 Sep 2022 14:14:16 +0800 Subject: tracepoint: Optimize the critical region of mutex_lock in tracepoint_module_coming() The memory allocation of 'tp_mod' does not require mutex_lock() protection, move it out. Link: https://lkml.kernel.org/r/20220914061416.1630-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Signed-off-by: Steven Rostedt (Google) --- kernel/tracepoint.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ef42c1a11920..f23144af5743 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; - int ret = 0; if (!mod->num_tracepoints) return 0; @@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod) */ if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoint_module_list_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); - if (!tp_mod) { - ret = -ENOMEM; - goto end; - } + if (!tp_mod) + return -ENOMEM; tp_mod->mod = mod; + + mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); -end: mutex_unlock(&tracepoint_module_list_mutex); - return ret; + return 0; } static void tracepoint_module_going(struct module *mod) -- cgit v1.2.3 From 40d81137f186f188f9aa9081ff12018be6ee19d0 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Wed, 14 Sep 2022 19:04:36 +0800 Subject: x86/ftrace: Remove unused modifying_ftrace_code declaration All uses of modifying_ftrace_code have been removed by commit 768ae4406a5c ("x86/ftrace: Use text_poke()"), so remove the declaration, too. Link: https://lkml.kernel.org/r/20220914110437.1436353-2-cuigaosheng1@huawei.com Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b5ef474be858..908d99b127d3 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -23,7 +23,6 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #ifndef __ASSEMBLY__ -extern atomic_t modifying_ftrace_code; extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) -- cgit v1.2.3 From d4940b84da4fbba7f7ed6e6287e1bef48d1293e9 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Wed, 14 Sep 2022 19:04:37 +0800 Subject: x86/kprobes: Remove unused arch_kprobe_override_function() declaration All uses of arch_kprobe_override_function() have been removed by commit 540adea3809f ("error-injection: Separate error-injection from kprobe"), so remove the declaration, too. Link: https://lkml.kernel.org/r/20220914110437.1436353-3-cuigaosheng1@huawei.com Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/kprobes.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 71ea2eab43d5..a2e9317aad49 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -50,8 +50,6 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -extern void arch_kprobe_override_function(struct pt_regs *regs); - /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ -- cgit v1.2.3 From ac48e189527fae87253ef2bf58892e782fb36874 Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Mon, 19 Sep 2022 20:56:28 +0800 Subject: tracing: kprobe: Fix kprobe event gen test module on exit Correct gen_kretprobe_test clr event para on module exit. This will make it can't to delete. Link: https://lkml.kernel.org/r/20220919125629.238242-2-zouyipeng@huawei.com Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module") Signed-off-by: Yipeng Zou Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/kprobe_event_gen_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index 18b0f1cbb947..e023154be0f8 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -206,7 +206,7 @@ static void __exit kprobe_event_gen_test_exit(void) WARN_ON(kprobe_event_delete("gen_kprobe_test")); /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, "kprobes", "gen_kretprobe_test", false)); -- cgit v1.2.3 From d8ef45d66c01425ff748e13ef7dd1da7a91cc93c Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Mon, 19 Sep 2022 20:56:29 +0800 Subject: tracing: kprobe: Make gen test module work in arm and riscv For now, this selftest module can only work in x86 because of the kprobe cmd was fixed use of x86 registers. This patch adapted to register names under arm and riscv, So that this module can be worked on those platform. Link: https://lkml.kernel.org/r/20220919125629.238242-3-zouyipeng@huawei.com Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module") Signed-off-by: Yipeng Zou Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/kprobe_event_gen_test.c | 47 +++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index e023154be0f8..80e04a1e1977 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -35,6 +35,45 @@ static struct trace_event_file *gen_kprobe_test; static struct trace_event_file *gen_kretprobe_test; +#define KPROBE_GEN_TEST_FUNC "do_sys_open" + +/* X86 */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32) +#define KPROBE_GEN_TEST_ARG0 "dfd=%ax" +#define KPROBE_GEN_TEST_ARG1 "filename=%dx" +#define KPROBE_GEN_TEST_ARG2 "flags=%cx" +#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)" + +/* ARM64 */ +#elif defined(CONFIG_ARM64) +#define KPROBE_GEN_TEST_ARG0 "dfd=%x0" +#define KPROBE_GEN_TEST_ARG1 "filename=%x1" +#define KPROBE_GEN_TEST_ARG2 "flags=%x2" +#define KPROBE_GEN_TEST_ARG3 "mode=%x3" + +/* ARM */ +#elif defined(CONFIG_ARM) +#define KPROBE_GEN_TEST_ARG0 "dfd=%r0" +#define KPROBE_GEN_TEST_ARG1 "filename=%r1" +#define KPROBE_GEN_TEST_ARG2 "flags=%r2" +#define KPROBE_GEN_TEST_ARG3 "mode=%r3" + +/* RISCV */ +#elif defined(CONFIG_RISCV) +#define KPROBE_GEN_TEST_ARG0 "dfd=%a0" +#define KPROBE_GEN_TEST_ARG1 "filename=%a1" +#define KPROBE_GEN_TEST_ARG2 "flags=%a2" +#define KPROBE_GEN_TEST_ARG3 "mode=%a3" + +/* others */ +#else +#define KPROBE_GEN_TEST_ARG0 NULL +#define KPROBE_GEN_TEST_ARG1 NULL +#define KPROBE_GEN_TEST_ARG2 NULL +#define KPROBE_GEN_TEST_ARG3 NULL +#endif + + /* * Test to make sure we can create a kprobe event, then add more * fields. @@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void) * fields. */ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", - "do_sys_open", - "dfd=%ax", "filename=%dx"); + KPROBE_GEN_TEST_FUNC, + KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1); if (ret) goto free; /* Use kprobe_event_add_fields to add the rest of the fields */ - ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)"); + ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3); if (ret) goto free; @@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void) * Define the kretprobe event. */ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", - "do_sys_open", + KPROBE_GEN_TEST_FUNC, "$retval"); if (ret) goto free; -- cgit v1.2.3 From 99ee9317a1305cd5626736785c8cb38b0e47686c Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 19 Sep 2022 08:49:32 -0600 Subject: tracing/osnoise: Fix possible recursive locking in stop_per_cpu_kthreads There is a recursive lock on the cpu_hotplug_lock. In kernel/trace/trace_osnoise.c:_per_cpu_kthreads: - start_per_cpu_kthreads calls cpus_read_lock() and if start_kthreads returns a error it will call stop_per_cpu_kthreads. - stop_per_cpu_kthreads then calls cpus_read_lock() again causing deadlock. Fix this by calling cpus_read_unlock() before calling stop_per_cpu_kthreads. This behavior can also be seen in commit f46b16520a08 ("trace/hwlat: Implement the per-cpu mode"). This error was noticed during the LTP ftrace-stress-test: WARNING: possible recursive locking detected -------------------------------------------- sh/275006 is trying to acquire lock: ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: stop_per_cpu_kthreads but task is already holding lock: ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: start_per_cpu_kthreads other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(cpu_hotplug_lock); lock(cpu_hotplug_lock); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by sh/275006: #0: ffff8881023f0470 (sb_writers#24){.+.+}-{0:0}, at: ksys_write #1: ffffffffb084f430 (trace_types_lock){+.+.}-{3:3}, at: rb_simple_write #2: ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: start_per_cpu_kthreads Link: https://lkml.kernel.org/r/20220919144932.3064014-1-npache@redhat.com Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Signed-off-by: Nico Pache Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 313439920a8c..78d536d3ff3d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void) for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); if (retval) { + cpus_read_unlock(); stop_per_cpu_kthreads(); - break; + return retval; } } -- cgit v1.2.3 From 834168fb2ce57681dee86a405ec560f54417830c Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 22 Sep 2022 18:32:08 +0800 Subject: rv/monitor: Add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220922103208.162869-1-xiujianfeng@huawei.com Fixes: 24bce201d798 ("tools/rv: Add dot2k") Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k") Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor") Signed-off-by: Xiu Jianfeng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.c | 4 ++-- kernel/trace/rv/monitors/wwnr/wwnr.c | 4 ++-- tools/verification/dot2/dot2k_templates/main_global.c | 4 ++-- tools/verification/dot2/dot2k_templates/main_per_cpu.c | 4 ++-- tools/verification/dot2/dot2k_templates/main_per_task.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 1a989bc142f3..b2b49a27e886 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -69,13 +69,13 @@ static struct rv_monitor rv_wip = { .enabled = 0, }; -static int register_wip(void) +static int __init register_wip(void) { rv_register_monitor(&rv_wip); return 0; } -static void unregister_wip(void) +static void __exit unregister_wip(void) { rv_unregister_monitor(&rv_wip); } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index a063b93c6a1d..0e43dd2db685 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -68,13 +68,13 @@ static struct rv_monitor rv_wwnr = { .enabled = 0, }; -static int register_wwnr(void) +static int __init register_wwnr(void) { rv_register_monitor(&rv_wwnr); return 0; } -static void unregister_wwnr(void) +static void __exit unregister_wwnr(void) { rv_unregister_monitor(&rv_wwnr); } diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main_global.c index dcd1162dced8..a5658bfb9044 100644 --- a/tools/verification/dot2/dot2k_templates/main_global.c +++ b/tools/verification/dot2/dot2k_templates/main_global.c @@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = { .enabled = 0, }; -static int register_MODEL_NAME(void) +static int __init register_MODEL_NAME(void) { rv_register_monitor(&rv_MODEL_NAME); return 0; } -static void unregister_MODEL_NAME(void) +static void __exit unregister_MODEL_NAME(void) { rv_unregister_monitor(&rv_MODEL_NAME); } diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c index 8f877e86a22f..03539a97633f 100644 --- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c +++ b/tools/verification/dot2/dot2k_templates/main_per_cpu.c @@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = { .enabled = 0, }; -static int register_MODEL_NAME(void) +static int __init register_MODEL_NAME(void) { rv_register_monitor(&rv_MODEL_NAME); return 0; } -static void unregister_MODEL_NAME(void) +static void __exit unregister_MODEL_NAME(void) { rv_unregister_monitor(&rv_MODEL_NAME); } diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c index 8c2fdb824634..ffd92af87a86 100644 --- a/tools/verification/dot2/dot2k_templates/main_per_task.c +++ b/tools/verification/dot2/dot2k_templates/main_per_task.c @@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = { .enabled = 0, }; -static int register_MODEL_NAME(void) +static int __init register_MODEL_NAME(void) { rv_register_monitor(&rv_MODEL_NAME); return 0; } -static void unregister_MODEL_NAME(void) +static void __exit unregister_MODEL_NAME(void) { rv_unregister_monitor(&rv_MODEL_NAME); } -- cgit v1.2.3 From c0a581d7126c0bbc96163276f585fd7b4e4d8d0e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 22 Sep 2022 10:56:22 -0400 Subject: tracing: Disable interrupt or preemption before acquiring arch_spinlock_t It was found that some tracing functions in kernel/trace/trace.c acquire an arch_spinlock_t with preemption and irqs enabled. An example is the tracing_saved_cmdlines_size_read() function which intermittently causes a "BUG: using smp_processor_id() in preemptible" warning when the LTP read_all_proc test is run. That can be problematic in case preemption happens after acquiring the lock. Add the necessary preemption or interrupt disabling code in the appropriate places before acquiring an arch_spinlock_t. The convention here is to disable preemption for trace_cmdline_lock and interupt for max_lock. Link: https://lkml.kernel.org/r/20220922145622.1744826-1-longman@redhat.com Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Boqun Feng Cc: stable@vger.kernel.org Fixes: a35873a0993b ("tracing: Add conditional snapshot") Fixes: 939c7a4f04fc ("tracing: Introduce saved_cmdlines_size file") Suggested-by: Steven Rostedt Signed-off-by: Waiman Long Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3005279165d..aed7ea6e6045 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return cond_data; } @@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); mutex_unlock(&trace_types_lock); @@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) @@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return ret; } @@ -2200,6 +2206,11 @@ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk) * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. */ + lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; @@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, char buf[64]; int r; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; } @@ -7436,10 +7457,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; -- cgit v1.2.3 From 3008119a3dd8052b7e5c07488e20a7abb0d287f2 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 23 Sep 2022 17:00:12 +0800 Subject: ftrace: Remove obsoleted code from ftrace and task_struct The trace of "struct task_struct" was no longer used since commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do"), and the functions about flags for current->trace is useless, so remove them. Link: https://lkml.kernel.org/r/20220923090012.505990-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 41 ----------------------------------------- include/linux/sched.h | 3 --- 2 files changed, 44 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b61371e287b..62557d4bffc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1122,47 +1122,6 @@ static inline void unpause_graph_tracing(void) { } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_TRACING - -/* flags for current->trace */ -enum { - TSK_TRACE_FL_TRACE_BIT = 0, - TSK_TRACE_FL_GRAPH_BIT = 1, -}; -enum { - TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, - TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, -}; - -static inline void set_tsk_trace_trace(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_trace(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_trace(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_TRACE; -} - -static inline void set_tsk_trace_graph(struct task_struct *tsk) -{ - set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline void clear_tsk_trace_graph(struct task_struct *tsk) -{ - clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); -} - -static inline int test_tsk_trace_graph(struct task_struct *tsk) -{ - return tsk->trace & TSK_TRACE_FL_GRAPH; -} - enum ftrace_dump_mode; extern enum ftrace_dump_mode ftrace_dump_on_oops; diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..c7ee04e9147a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1381,9 +1381,6 @@ struct task_struct { #endif #ifdef CONFIG_TRACING - /* State flags for use by tracers: */ - unsigned long trace; - /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -- cgit v1.2.3 From ae398ad89456847aab9f12b0e63c51443af5da48 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Sat, 24 Sep 2022 15:26:29 +0800 Subject: x86: kprobes: Remove unused macro stack_addr An unused macro reported by [-Wunused-macros]. This macro is used to access the sp in pt_regs because at that time x86_32 can only get sp by kernel_stack_pointer(regs). '3c88c692c287 ("x86/stackframe/32: Provide consistent pt_regs")' This commit have unified the pt_regs and from them we can get sp from pt_regs with regs->sp easily. Nowhere is using this macro anymore. Refrencing pt_regs directly is more clear. Remove this macro for code cleaning. Link: https://lkml.kernel.org/r/20220924072629.104759-1-chenzhongjin@huawei.com Signed-off-by: Chen Zhongjin Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- arch/x86/kernel/kprobes/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4c3c27b6aea3..eb8bc82846b9 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -59,8 +59,6 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)regs->sp) - #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ -- cgit v1.2.3 From dc399adecd4e2826868e5d116a58e33071b18346 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 24 Sep 2022 22:13:34 +0800 Subject: tracing/eprobe: Fix alloc event dir failed when event name no set The event dir will alloc failed when event name no set, using the command: "echo "e:esys/ syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events" It seems that dir name="syscalls/sys_enter_openat" is not allowed in debugfs. So just use the "sys_enter_openat" as the event name. Link: https://lkml.kernel.org/r/1664028814-45923-1-git-send-email-chentao.kernel@linux.alibaba.com Cc: Ingo Molnar Cc: Tom Zanussi Cc: Linyu Yuan Cc: Tao Chen Signed-off-by: Tao Chen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 78299d3724a2..c08bde9871ec 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -1039,8 +1039,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); + strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); event = buf1; } -- cgit v1.2.3 From 0ce0638edf5ec83343302b884fa208179580700a Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 26 Sep 2022 15:20:08 +0000 Subject: ftrace: Properly unset FTRACE_HASH_FL_MOD When executing following commands like what document said, but the log "#### all functions enabled ####" was not shown as expect: 1. Set a 'mod' filter: $ echo 'write*:mod:ext3' > /sys/kernel/tracing/set_ftrace_filter 2. Invert above filter: $ echo '!write*:mod:ext3' >> /sys/kernel/tracing/set_ftrace_filter 3. Read the file: $ cat /sys/kernel/tracing/set_ftrace_filter By some debugging, I found that flag FTRACE_HASH_FL_MOD was not unset after inversion like above step 2 and then result of ftrace_hash_empty() is incorrect. Link: https://lkml.kernel.org/r/20220926152008.2239274-1-zhengyejian1@huawei.com Cc: Cc: stable@vger.kernel.org Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 439e2ab6905e..5a1ec7e1af33 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6081,8 +6081,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (iter->tr && !list_empty(&iter->tr->mod_trace)) - iter->hash->flags |= FTRACE_HASH_FL_MOD; + if (iter->tr) { + if (list_empty(&iter->tr->mod_trace)) + iter->hash->flags &= ~FTRACE_HASH_FL_MOD; + else + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } } else orig_hash = &iter->ops->func_hash->notrace_hash; -- cgit v1.2.3 From 9d2ce78ddcee159eb6a97449e9c68b6d60b9cec4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 17:41:46 -0700 Subject: ftrace: Fix recursive locking direct_mutex in ftrace_modify_direct_caller Naveen reported recursive locking of direct_mutex with sample ftrace-direct-modify.ko: [ 74.762406] WARNING: possible recursive locking detected [ 74.762887] 6.0.0-rc6+ #33 Not tainted [ 74.763216] -------------------------------------------- [ 74.763672] event-sample-fn/1084 is trying to acquire lock: [ 74.764152] ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \ register_ftrace_function+0x1f/0x180 [ 74.764922] [ 74.764922] but task is already holding lock: [ 74.765421] ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \ modify_ftrace_direct+0x34/0x1f0 [ 74.766142] [ 74.766142] other info that might help us debug this: [ 74.766701] Possible unsafe locking scenario: [ 74.766701] [ 74.767216] CPU0 [ 74.767437] ---- [ 74.767656] lock(direct_mutex); [ 74.767952] lock(direct_mutex); [ 74.768245] [ 74.768245] *** DEADLOCK *** [ 74.768245] [ 74.768750] May be due to missing lock nesting notation [ 74.768750] [ 74.769332] 1 lock held by event-sample-fn/1084: [ 74.769731] #0: ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \ modify_ftrace_direct+0x34/0x1f0 [ 74.770496] [ 74.770496] stack backtrace: [ 74.770884] CPU: 4 PID: 1084 Comm: event-sample-fn Not tainted ... [ 74.771498] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ... [ 74.772474] Call Trace: [ 74.772696] [ 74.772896] dump_stack_lvl+0x44/0x5b [ 74.773223] __lock_acquire.cold.74+0xac/0x2b7 [ 74.773616] lock_acquire+0xd2/0x310 [ 74.773936] ? register_ftrace_function+0x1f/0x180 [ 74.774357] ? lock_is_held_type+0xd8/0x130 [ 74.774744] ? my_tramp2+0x11/0x11 [ftrace_direct_modify] [ 74.775213] __mutex_lock+0x99/0x1010 [ 74.775536] ? register_ftrace_function+0x1f/0x180 [ 74.775954] ? slab_free_freelist_hook.isra.43+0x115/0x160 [ 74.776424] ? ftrace_set_hash+0x195/0x220 [ 74.776779] ? register_ftrace_function+0x1f/0x180 [ 74.777194] ? kfree+0x3e1/0x440 [ 74.777482] ? my_tramp2+0x11/0x11 [ftrace_direct_modify] [ 74.777941] ? __schedule+0xb40/0xb40 [ 74.778258] ? register_ftrace_function+0x1f/0x180 [ 74.778672] ? my_tramp1+0xf/0xf [ftrace_direct_modify] [ 74.779128] register_ftrace_function+0x1f/0x180 [ 74.779527] ? ftrace_set_filter_ip+0x33/0x70 [ 74.779910] ? __schedule+0xb40/0xb40 [ 74.780231] ? my_tramp1+0xf/0xf [ftrace_direct_modify] [ 74.780678] ? my_tramp2+0x11/0x11 [ftrace_direct_modify] [ 74.781147] ftrace_modify_direct_caller+0x5b/0x90 [ 74.781563] ? 0xffffffffa0201000 [ 74.781859] ? my_tramp1+0xf/0xf [ftrace_direct_modify] [ 74.782309] modify_ftrace_direct+0x1b2/0x1f0 [ 74.782690] ? __schedule+0xb40/0xb40 [ 74.783014] ? simple_thread+0x2a/0xb0 [ftrace_direct_modify] [ 74.783508] ? __schedule+0xb40/0xb40 [ 74.783832] ? my_tramp2+0x11/0x11 [ftrace_direct_modify] [ 74.784294] simple_thread+0x76/0xb0 [ftrace_direct_modify] [ 74.784766] kthread+0xf5/0x120 [ 74.785052] ? kthread_complete_and_exit+0x20/0x20 [ 74.785464] ret_from_fork+0x22/0x30 [ 74.785781] Fix this by using register_ftrace_function_nolock in ftrace_modify_direct_caller. Link: https://lkml.kernel.org/r/20220927004146.1215303-1-song@kernel.org Fixes: 53cd885bc5c3 ("ftrace: Allow IPMODIFY and DIRECT ops on the same function") Reported-and-tested-by: Naveen N. Rao Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5a1ec7e1af33..406d0597c409 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5427,6 +5427,8 @@ static struct ftrace_ops stub_ops = { * it is safe to modify the ftrace record, where it should be * currently calling @old_addr directly, to call @new_addr. * + * This is called with direct_mutex locked. + * * Safety checks should be made to make sure that the code at * @rec->ip is currently calling @old_addr. And this must * also update entry->direct to @new_addr. @@ -5439,6 +5441,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ip = rec->ip; int ret; + lockdep_assert_held(&direct_mutex); + /* * The ftrace_lock was used to determine if the record * had more than one registered user to it. If it did, @@ -5461,7 +5465,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, if (ret) goto out_lock; - ret = register_ftrace_function(&stub_ops); + ret = register_ftrace_function_nolock(&stub_ops); if (ret) { ftrace_set_filter_ip(&stub_ops, ip, 1, 0); goto out_lock; -- cgit v1.2.3 From fa8f4a89736b654125fb254b0db753ac68a5fced Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 14:43:17 -0400 Subject: ring-buffer: Allow splice to read previous partially read pages If a page is partially read, and then the splice system call is run against the ring buffer, it will always fail to read, no matter how much is in the ring buffer. That's because the code path for a partial read of the page does will fail if the "full" flag is set. The splice system call wants full pages, so if the read of the ring buffer is not yet full, it should return zero, and the splice will block. But if a previous read was done, where the beginning has been consumed, it should still be given to the splice caller if the rest of the page has been written to. This caused the splice command to never consume data in this scenario, and let the ring buffer just fill up and lose events. Link: https://lkml.kernel.org/r/20220927144317.46be6b80@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 8789a9e7df6bf ("ring-buffer: read page interface") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d59b6a328b7f..6b145d48dfd1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5616,7 +5616,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int pos = 0; unsigned int size; - if (full) + /* + * If a full page is expected, this can still be returned + * if there's been a previous partial read and the + * rest of the page can be read and the commit page is off + * the reader page. + */ + if (full && + (!read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) -- cgit v1.2.3 From 3b19d614b61b93a131f463817e08219c9ce1fee3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 19:15:24 -0400 Subject: ring-buffer: Have the shortest_full queue be the shortest not longest The logic to know when the shortest waiters on the ring buffer should be woken up or not has uses a less than instead of a greater than compare, which causes the shortest_full to actually be the longest. Link: https://lkml.kernel.org/r/20220927231823.718039222@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6b145d48dfd1..02db92c9eb1b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1011,7 +1011,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) nr_pages = cpu_buffer->nr_pages; dirty = ring_buffer_nr_dirty_pages(buffer, cpu); if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (!pagebusy && -- cgit v1.2.3 From ec0bbc5ec5664dcee344f79373852117dc672c86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 19:15:25 -0400 Subject: ring-buffer: Check pending waiters when doing wake ups as well The wake up waiters only checks the "wakeup_full" variable and not the "full_waiters_pending". The full_waiters_pending is set when a waiter is added to the wait queue. The wakeup_full is only set when an event is triggered, and it clears the full_waiters_pending to avoid multiple calls to irq_work_queue(). The irq_work callback really needs to check both wakeup_full as well as full_waiters_pending such that this code can be used to wake up waiters when a file is closed that represents the ring buffer and the waiters need to be woken up. Link: https://lkml.kernel.org/r/20220927231824.209460321@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 02db92c9eb1b..5a7d818ca3ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -917,8 +917,9 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); - if (rbwork->wakeup_full) { + if (rbwork->full_waiters_pending || rbwork->wakeup_full) { rbwork->wakeup_full = false; + rbwork->full_waiters_pending = false; wake_up_all(&rbwork->full_waiters); } } -- cgit v1.2.3 From 7e9fbbb1b776d8d7969551565bc246f74ec53b27 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Sep 2022 13:39:38 -0400 Subject: ring-buffer: Add ring_buffer_wake_waiters() On closing of a file that represents a ring buffer or flushing the file, there may be waiters on the ring buffer that needs to be woken up and exit the ring_buffer_wait() function. Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer and allow them to exit the wait loop. Link: https://lkml.kernel.org/r/20220928133938.28dc2c27@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dac53fd3afea..2504df9a0453 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); - +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu); #define RING_BUFFER_ALL_CPUS -1 diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5a7d818ca3ea..3046deacf7b3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -413,6 +413,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -924,6 +925,37 @@ static void rb_wake_up_waiters(struct irq_work *work) } } +/** + * ring_buffer_wake_waiters - wake up any waiters on this ring buffer + * @buffer: The ring buffer to wake waiters on + * + * In the case of a file that represents a ring buffer is closing, + * it is prudent to wake up any waiters that are on this. + */ +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *rbwork; + + if (cpu == RING_BUFFER_ALL_CPUS) { + + /* Wake up individual ones too. One level recursion */ + for_each_buffer_cpu(buffer, cpu) + ring_buffer_wake_waiters(buffer, cpu); + + rbwork = &buffer->irq_work; + } else { + cpu_buffer = buffer->buffers[cpu]; + rbwork = &cpu_buffer->irq_work; + } + + rbwork->wait_index++; + /* make sure the waiters see the new index */ + smp_wmb(); + + rb_wake_up_waiters(&rbwork->work); +} + /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on @@ -939,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; + long wait_index; int ret = 0; /* @@ -957,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } + wait_index = READ_ONCE(work->wait_index); while (true) { if (full) @@ -1021,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) } schedule(); + + /* Make sure to see the new wait index */ + smp_rmb(); + if (wait_index != work->wait_index) + break; } if (full) -- cgit v1.2.3 From f3ddb74ad0790030c9592229fb14d8c451f4e9a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 27 Sep 2022 19:15:27 -0400 Subject: tracing: Wake up ring buffer waiters on closing of the file When the file that represents the ring buffer is closed, there may be waiters waiting on more input from the ring buffer. Call ring_buffer_wake_waiters() to wake up any waiters when the file is closed. Link: https://lkml.kernel.org/r/20220927231825.182416969@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8401dec93c15..20749bd9db71 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -92,6 +92,7 @@ struct trace_iterator { unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; + long wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc. */ struct trace_seq tmp_seq; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aed7ea6e6045..e101b0764b39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8160,6 +8160,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8313,6 +8319,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { + long wait_index; + if (ret) goto out; @@ -8320,10 +8328,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; + wait_index = READ_ONCE(iter->wait_index); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; + /* Make sure we see the new wait_index */ + smp_rmb(); + if (wait_index != iter->wait_index) + goto out; + goto again; } -- cgit v1.2.3 From 01b2a52171735c6eea80ee2f355f32bea6c41418 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 29 Sep 2022 09:50:29 -0400 Subject: tracing: Add ioctl() to force ring buffer waiters to wake up If a process is waiting on the ring buffer for data, there currently isn't a clean way to force it to wake up. Add an ioctl call that will force any tasks that are waiting on the trace_pipe_raw file to wake up. Link: https://lkml.kernel.org/r/20220929095029.117f913f@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e101b0764b39..58afc83afc9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8349,12 +8349,34 @@ out: return ret; } +/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + if (cmd) + return -ENOIOCTLCMD; + + mutex_lock(&trace_types_lock); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + mutex_unlock(&trace_types_lock); + return 0; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, + .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, }; -- cgit v1.2.3 From 2b0fd9a59b7990c161fa1cb7b79edb22847c87c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Sep 2022 18:22:20 -0400 Subject: tracing: Wake up waiters when tracing is disabled When tracing is disabled, there's no reason that waiters should stay waiting, wake them up, otherwise tasks get stuck when they should be flushing the buffers. Cc: stable@vger.kernel.org Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 58afc83afc9d..bb5597c6bfc1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8334,6 +8334,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (ret) goto out; + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + /* Make sure we see the new wait_index */ smp_rmb(); if (wait_index != iter->wait_index) @@ -9065,6 +9069,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf, tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } mutex_unlock(&trace_types_lock); } -- cgit v1.2.3 From e841e8bfac490957fed3157326b96342dc76798a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Sep 2022 22:58:28 +0100 Subject: tracing: Fix spelling mistake "preapre" -> "prepare" There is a spelling mistake in the trace text. Fix it. Link: https://lkml.kernel.org/r/20220928215828.66325-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb5597c6bfc1..def721de68a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10157,7 +10157,7 @@ __init static int tracer_alloc_buffers(void) * buffer. The memory will be removed once the "instance" is removed. */ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, - "trace/RB:preapre", trace_rb_cpu_prepare, + "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; -- cgit v1.2.3 From 9cbf12343d595a1a5ecc84c4471b1fe52e2be19b Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:04 -0700 Subject: tracing/user_events: Use NULL for strstr checks Trivial fix to ensure strstr checks use NULL instead of 0. Link: https://lkml.kernel.org/r/20220728233309.1896-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index a6621c52ce45..075d694d20e3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -277,7 +277,7 @@ static int user_event_add_field(struct user_event *user, const char *type, goto add_field; add_validator: - if (strstr(type, "char") != 0) + if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; validator = kmalloc(sizeof(*validator), GFP_KERNEL); @@ -458,7 +458,7 @@ static const char *user_field_format(const char *type) return "%d"; if (strcmp(type, "unsigned char") == 0) return "%u"; - if (strstr(type, "char[") != 0) + if (strstr(type, "char[") != NULL) return "%s"; /* Unknown, likely struct, allowed treat as 64-bit */ @@ -479,7 +479,7 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func) return false; check: - return strstr(type, "char") != 0; + return strstr(type, "char") != NULL; } #define LEN_OR_ZERO (len ? len - pos : 0) -- cgit v1.2.3 From 95f187603dbff69bef19b2ab3bb54d2c060f556d Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:05 -0700 Subject: tracing/user_events: Use WRITE instead of READ for io vector import import_single_range expects the direction/rw to be where it came from, not the protection/limit. Since the import is in a write path use WRITE. Link: https://lkml.kernel.org/r/20220728233309.1896-3-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Reported-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 075d694d20e3..15edbf6b1e2e 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1245,7 +1245,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + if (unlikely(import_single_range(WRITE, (char __user *)ubuf, + count, &iov, &i))) return -EFAULT; return user_events_write_core(file, &i); -- cgit v1.2.3 From e6f89a149872ab0e03cfded97983df74dfb0ef21 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:06 -0700 Subject: tracing/user_events: Ensure user provided strings are safely formatted User processes can provide bad strings that may cause issues or leak kernel details back out. Don't trust the content of these strings when formatting strings for matching. This also moves to a consistent dynamic length string creation model. Link: https://lkml.kernel.org/r/20220728233309.1896-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Reported-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 91 ++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 15edbf6b1e2e..f9bb7d37d76f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -45,7 +45,6 @@ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -#define MAX_FIELD_ARG_NAME 256 static char *register_page_data; @@ -483,6 +482,48 @@ check: } #define LEN_OR_ZERO (len ? len - pos : 0) +static int user_dyn_field_set_string(int argc, const char **argv, int *iout, + char *buf, int len, bool *colon) +{ + int pos = 0, i = *iout; + + *colon = false; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + *colon = true; + break; + } + } + + /* Actual set, advance i */ + if (len != 0) + *iout = i; + + return pos + 1; +} + +static int user_field_set_string(struct ftrace_event_field *field, + char *buf, int len, bool colon) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + + if (colon) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + return pos + 1; +} + static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { struct ftrace_event_field *field, *next; @@ -926,49 +967,35 @@ static int user_event_free(struct dyn_event *ev) static bool user_field_match(struct ftrace_event_field *field, int argc, const char **argv, int *iout) { - char *field_name, *arg_name; - int len, pos, i = *iout; + char *field_name = NULL, *dyn_field_name = NULL; bool colon = false, match = false; + int dyn_len, len; - if (i >= argc) + if (*iout >= argc) return false; - len = MAX_FIELD_ARG_NAME; - field_name = kmalloc(len, GFP_KERNEL); - arg_name = kmalloc(len, GFP_KERNEL); + dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + 0, &colon); - if (!arg_name || !field_name) - goto out; - - pos = 0; - - for (; i < argc; ++i) { - if (i != *iout) - pos += snprintf(arg_name + pos, len - pos, " "); + len = user_field_set_string(field, field_name, 0, colon); - pos += snprintf(arg_name + pos, len - pos, argv[i]); - - if (strchr(argv[i], ';')) { - ++i; - colon = true; - break; - } - } + if (dyn_len != len) + return false; - pos = 0; + dyn_field_name = kmalloc(dyn_len, GFP_KERNEL); + field_name = kmalloc(len, GFP_KERNEL); - pos += snprintf(field_name + pos, len - pos, field->type); - pos += snprintf(field_name + pos, len - pos, " "); - pos += snprintf(field_name + pos, len - pos, field->name); + if (!dyn_field_name || !field_name) + goto out; - if (colon) - pos += snprintf(field_name + pos, len - pos, ";"); + user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + dyn_len, &colon); - *iout = i; + user_field_set_string(field, field_name, len, colon); - match = strcmp(arg_name, field_name) == 0; + match = strcmp(dyn_field_name, field_name) == 0; out: - kfree(arg_name); + kfree(dyn_field_name); kfree(field_name); return match; -- cgit v1.2.3 From d401b72458562c2f2a81dad162de5c1b8e191e17 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:07 -0700 Subject: tracing/user_events: Use refcount instead of atomic for ref tracking User processes could open up enough event references to cause rollovers. These could cause use after free scenarios, which we do not want. Switching to refcount APIs prevent this, but will leak memory once saturated. Once saturated, user processes can still use the events. This prevents a bad user process from stopping existing telemetry from being emitted. Link: https://lkml.kernel.org/r/20220728233309.1896-5-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Reported-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 53 ++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index f9bb7d37d76f..2bcae7abfa81 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -57,7 +58,7 @@ static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); * within a file a user_event might be created if it does not * already exist. These are globally used and their lifetime * is tied to the refcnt member. These cannot go away until the - * refcnt reaches zero. + * refcnt reaches one. */ struct user_event { struct tracepoint tracepoint; @@ -67,7 +68,7 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; - atomic_t refcnt; + refcount_t refcnt; int index; int flags; int min_size; @@ -105,6 +106,12 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static __always_inline __must_check +bool user_event_last_ref(struct user_event *user) +{ + return refcount_read(&user->refcnt) == 1; +} + static __always_inline __must_check size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) { @@ -662,7 +669,7 @@ static struct user_event *find_user_event(char *name, u32 *outkey) hash_for_each_possible(register_table, user, node, key) if (!strcmp(EVENT_NAME(user), name)) { - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); return user; } @@ -876,12 +883,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); update_reg_page_for(user); return 0; dec: update_reg_page_for(user); - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); return 0; } @@ -907,7 +914,7 @@ static int user_event_create(const char *raw_command) ret = user_event_parse_cmd(name, &user); if (!ret) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); mutex_unlock(®_mutex); @@ -951,14 +958,14 @@ static bool user_event_is_busy(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - return atomic_read(&user->refcnt) != 0; + return !user_event_last_ref(user); } static int user_event_free(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - if (atomic_read(&user->refcnt) != 0) + if (!user_event_last_ref(user)) return -EBUSY; return destroy_user_event(user); @@ -1137,8 +1144,8 @@ static int user_event_parse(char *name, char *args, char *flags, user->index = index; - /* Ensure we track ref */ - atomic_inc(&user->refcnt); + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -1164,29 +1171,17 @@ put_user: static int delete_user_event(char *name) { u32 key; - int ret; struct user_event *user = find_user_event(name, &key); if (!user) return -ENOENT; - /* Ensure we are the last ref */ - if (atomic_read(&user->refcnt) != 1) { - ret = -EBUSY; - goto put_ref; - } - - ret = destroy_user_event(user); + refcount_dec(&user->refcnt); - if (ret) - goto put_ref; - - return ret; -put_ref: - /* No longer have this ref */ - atomic_dec(&user->refcnt); + if (!user_event_last_ref(user)) + return -EBUSY; - return ret; + return destroy_user_event(user); } /* @@ -1314,7 +1309,7 @@ static int user_events_ref_add(struct file *file, struct user_event *user) new_refs->events[i] = user; - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); rcu_assign_pointer(file->private_data, new_refs); @@ -1374,7 +1369,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) ret = user_events_ref_add(file, user); /* No longer need parse ref, ref_add either worked or not */ - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); /* Positive number is index and valid */ if (ret < 0) @@ -1464,7 +1459,7 @@ static int user_events_release(struct inode *node, struct file *file) user = refs->events[i]; if (user) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); } out: file->private_data = NULL; -- cgit v1.2.3 From 39d6d08b2edf99c4b39a689a70bf0adee065b357 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:08 -0700 Subject: tracing/user_events: Use bits vs bytes for enabled status page data User processes may require many events and when they do the cache performance of a byte index status check is less ideal than a bit index. The previous event limit per-page was 4096, the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check that the bit at status_bit has a bit set within the status page(s). Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 15 +---- kernel/trace/trace_events_user.c | 75 ++++++++++++++++++++--- samples/user_events/example.c | 25 +++++--- tools/testing/selftests/user_events/ftrace_test.c | 47 +++++++++++--- tools/testing/selftests/user_events/perf_test.c | 11 +++- 5 files changed, 135 insertions(+), 38 deletions(-) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 736e05603463..592a3fbed98e 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,15 +20,6 @@ #define USER_EVENTS_SYSTEM "user_events" #define USER_EVENTS_PREFIX "u:" -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - /* Create dynamic location entry within a 32-bit value */ #define DYN_LOC(offset, size) ((size) << 16 | (offset)) @@ -45,12 +36,12 @@ struct user_reg { /* Input: Pointer to string with event name, description and flags */ __u64 name_args; - /* Output: Byte index of the event within the status page */ - __u32 status_index; + /* Output: Bitwise index of the event within the status page */ + __u32 status_bit; /* Output: Index of the event to use when writing data */ __u32 write_index; -}; +} __attribute__((__packed__)); #define DIAG_IOC_MAGIC '*' diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 2bcae7abfa81..2c0a6ec75548 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -40,17 +40,44 @@ */ #define MAX_PAGE_ORDER 0 #define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) +#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) +#define MAX_EVENTS (MAX_BYTES * 8) /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 +/* + * The MAP_STATUS_* macros are used for taking a index and determining the + * appropriate byte and the bit in the byte to set/reset for an event. + * + * The lower 3 bits of the index decide which bit to set. + * The remaining upper bits of the index decide which byte to use for the bit. + * + * This is used when an event has a probe attached/removed to reflect live + * status of the event wanting tracing or not to user-programs via shared + * memory maps. + */ +#define MAP_STATUS_BYTE(index) ((index) >> 3) +#define MAP_STATUS_MASK(index) BIT((index) & 7) + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. These + * bits are compared against an internal byte on the event to determine which + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. + */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + static char *register_page_data; static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 4); +static DEFINE_HASHTABLE(register_table, 8); static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); /* @@ -72,6 +99,7 @@ struct user_event { int index; int flags; int min_size; + char status; }; /* @@ -106,6 +134,22 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static __always_inline +void user_event_register_set(struct user_event *user) +{ + int i = user->index; + + register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); +} + +static __always_inline +void user_event_register_clear(struct user_event *user) +{ + int i = user->index; + + register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); +} + static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { @@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user) dyn_event_remove(&user->devent); - register_page_data[user->index] = 0; + user_event_register_clear(user); clear_bit(user->index, page_bitmap); hash_del(&user->node); @@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - register_page_data[user->index] = status; + if (status) + user_event_register_set(user); + else + user_event_register_clear(user); + + user->status = status; } /* @@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (size > PAGE_SIZE) return -E2BIG; - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + kreg->size = size; + + return 0; } /* @@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) return ret; put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_index); + put_user(user->index, &ureg->status_bit); return 0; } @@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) { unsigned long size = vma->vm_end - vma->vm_start; - if (size != MAX_EVENTS) + if (size != MAX_BYTES) return -EINVAL; return remap_pfn_range(vma, vma->vm_start, @@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p) mutex_lock(®_mutex); hash_for_each(register_table, i, user, node) { - status = register_page_data[user->index]; + status = user->status; flags = user->flags; seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); diff --git a/samples/user_events/example.c b/samples/user_events/example.c index 4f5778e441c0..d06dc24156ec 100644 --- a/samples/user_events/example.c +++ b/samples/user_events/example.c @@ -12,13 +12,21 @@ #include #include #include +#include +#include #include +#if __BITS_PER_LONG == 64 +#define endian_swap(x) htole64(x) +#else +#define endian_swap(x) htole32(x) +#endif + /* Assumes debugfs is mounted */ const char *data_file = "/sys/kernel/debug/tracing/user_events_data"; const char *status_file = "/sys/kernel/debug/tracing/user_events_status"; -static int event_status(char **status) +static int event_status(long **status) { int fd = open(status_file, O_RDONLY); @@ -33,7 +41,8 @@ static int event_status(char **status) return 0; } -static int event_reg(int fd, const char *command, int *status, int *write) +static int event_reg(int fd, const char *command, long *index, long *mask, + int *write) { struct user_reg reg = {0}; @@ -43,7 +52,8 @@ static int event_reg(int fd, const char *command, int *status, int *write) if (ioctl(fd, DIAG_IOCSREG, ®) == -1) return -1; - *status = reg.status_index; + *index = reg.status_bit / __BITS_PER_LONG; + *mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG)); *write = reg.write_index; return 0; @@ -51,8 +61,9 @@ static int event_reg(int fd, const char *command, int *status, int *write) int main(int argc, char **argv) { - int data_fd, status, write; - char *status_page; + int data_fd, write; + long index, mask; + long *status_page; struct iovec io[2]; __u32 count = 0; @@ -61,7 +72,7 @@ int main(int argc, char **argv) data_fd = open(data_file, O_RDWR); - if (event_reg(data_fd, "test u32 count", &status, &write) == -1) + if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1) return errno; /* Setup iovec */ @@ -75,7 +86,7 @@ ask: getchar(); /* Check if anyone is listening */ - if (status_page[status]) { + if (status_page[index] & mask) { /* Yep, trace out our data */ writev(data_fd, (const struct iovec *)io, 2); diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index a80fb5ef61d5..404a2713dcae 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e const char *trace_file = "/sys/kernel/debug/tracing/trace"; const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format"; +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int trace_bytes(void) { int fd = open(trace_file, O_RDONLY); @@ -197,12 +202,12 @@ TEST_F(user, register_events) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Multiple registers should result in same index */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); @@ -212,15 +217,15 @@ TEST_F(user, register_events) { /* MMAP should work and be zero'd */ ASSERT_NE(MAP_FAILED, status_page); ASSERT_NE(NULL, status_page); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Enable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) - ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); /* Disable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* File still open should return -EBUSY for delete */ ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); @@ -240,6 +245,8 @@ TEST_F(user, write_events) { struct iovec io[3]; __u32 field1, field2; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event u32 field1; u32 field2"; @@ -254,10 +261,18 @@ TEST_F(user, write_events) { io[2].iov_base = &field2; io[2].iov_len = sizeof(field2); + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); + /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Write should fail on invalid slot with ENOENT */ io[0].iov_base = &field2; @@ -271,6 +286,9 @@ TEST_F(user, write_events) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Write should make it out to ftrace buffers */ before = trace_bytes(); ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3)); @@ -298,7 +316,7 @@ TEST_F(user, write_fault) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Write should work normally */ ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); @@ -315,6 +333,11 @@ TEST_F(user, write_validator) { int loc, bytes; char data[8]; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; + + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event __rel_loc char[] data"; @@ -322,7 +345,12 @@ TEST_F(user, write_validator) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); io[0].iov_base = ®.write_index; io[0].iov_len = sizeof(reg.write_index); @@ -340,6 +368,9 @@ TEST_F(user, write_validator) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Full in-bounds write should work */ before = trace_bytes(); loc = DYN_LOC(0, bytes); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 26851d51d6bb..8b4c7879d5a7 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid, return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); } +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int get_id(void) { FILE *fp = fopen(id_file, "r"); @@ -120,8 +125,8 @@ TEST_F(user, perf_write) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_NE(0, reg.status_bit); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Id should be there */ id = get_id(); @@ -144,7 +149,7 @@ TEST_F(user, perf_write) { ASSERT_NE(MAP_FAILED, perf_page); /* Status should be updated */ - ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); event.index = reg.write_index; event.field1 = 0xc001; -- cgit v1.2.3 From 933678b6183bbe7afa332e70132065db3305ee44 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 28 Jul 2022 16:33:09 -0700 Subject: tracing/user_events: Update ABI documentation to align to bits vs bytes Update the documentation to reflect the new ABI requirements and how to use the byte index with the mask properly to check event status. Link: https://lkml.kernel.org/r/20220728233309.1896-7-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/user_events.rst | 86 +++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index c180936f49fc..9f181f342a70 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -20,14 +20,14 @@ dynamic_events is the same as the ioctl with the u: prefix applied. Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration -process gives back two ints to the program for each event. The first int is the -status index. This index describes which byte in the +process gives back two ints to the program for each event. The first int is +the status bit. This describes which bit in little-endian format in the /sys/kernel/debug/tracing/user_events_status file represents this event. The -second int is the write index. This index describes the data when a write() or +second int is the write index which describes the data when a write() or writev() is called on the /sys/kernel/debug/tracing/user_events_data file. -The structures referenced in this document are contained with the -/include/uap/linux/user_events.h file in the source tree. +The structures referenced in this document are contained within the +/include/uapi/linux/user_events.h file in the source tree. **NOTE:** *Both user_events_status and user_events_data are under the tracefs filesystem and may be mounted at different paths than above.* @@ -38,18 +38,18 @@ Registering within a user process is done via ioctl() out to the /sys/kernel/debug/tracing/user_events_data file. The command to issue is DIAG_IOCSREG. -This command takes a struct user_reg as an argument:: +This command takes a packed struct user_reg as an argument:: struct user_reg { u32 size; u64 name_args; - u32 status_index; + u32 status_bit; u32 write_index; }; The struct user_reg requires two inputs, the first is the size of the structure to ensure forward and backward compatibility. The second is the command string -to issue for registering. Upon success two outputs are set, the status index +to issue for registering. Upon success two outputs are set, the status bit and the write index. User based events show up under tracefs like any other event under the @@ -111,15 +111,56 @@ in realtime. This allows user programs to only incur the cost of the write() or writev() calls when something is actively attached to the event. User programs call mmap() on /sys/kernel/debug/tracing/user_events_status to -check the status for each event that is registered. The byte to check in the -file is given back after the register ioctl() via user_reg.status_index. +check the status for each event that is registered. The bit to check in the +file is given back after the register ioctl() via user_reg.status_bit. The bit +is always in little-endian format. Programs can check if the bit is set either +using a byte-wise index with a mask or a long-wise index with a little-endian +mask. + Currently the size of user_events_status is a single page, however, custom kernel configurations can change this size to allow more user based events. In all cases the size of the file is a multiple of a page size. -For example, if the register ioctl() gives back a status_index of 3 you would -check byte 3 of the returned mmap data to see if anything is attached to that -event. +For example, if the register ioctl() gives back a status_bit of 3 you would +check byte 0 (3 / 8) of the returned mmap data and then AND the result with 8 +(1 << (3 % 8)) to see if anything is attached to that event. + +A byte-wise index check is performed as follows:: + + int index, mask; + char *status_page; + + index = status_bit / 8; + mask = 1 << (status_bit % 8); + + ... + + if (status_page[index] & mask) { + /* Enabled */ + } + +A long-wise index check is performed as follows:: + + #include + #include + + #if __BITS_PER_LONG == 64 + #define endian_swap(x) htole64(x) + #else + #define endian_swap(x) htole32(x) + #endif + + long index, mask, *status_page; + + index = status_bit / __BITS_PER_LONG; + mask = 1L << (status_bit % __BITS_PER_LONG); + mask = endian_swap(mask); + + ... + + if (status_page[index] & mask) { + /* Enabled */ + } Administrators can easily check the status of all registered events by reading the user_events_status file directly via a terminal. The output is as follows:: @@ -137,7 +178,7 @@ For example, on a system that has a single event the output looks like this:: Active: 1 Busy: 0 - Max: 4096 + Max: 32768 If a user enables the user event via ftrace, the output would change to this:: @@ -145,21 +186,10 @@ If a user enables the user event via ftrace, the output would change to this:: Active: 1 Busy: 1 - Max: 4096 - -**NOTE:** *A status index of 0 will never be returned. This allows user -programs to have an index that can be used on error cases.* - -Status Bits -^^^^^^^^^^^ -The byte being checked will be non-zero if anything is attached. Programs can -check specific bits in the byte to see what mechanism has been attached. - -The following values are defined to aid in checking what has been attached: - -**EVENT_STATUS_FTRACE** - Bit set if ftrace has been attached (Bit 0). + Max: 32768 -**EVENT_STATUS_PERF** - Bit set if perf has been attached (Bit 1). +**NOTE:** *A status bit of 0 will never be returned. This allows user programs +to have a bit that can be used on error cases.* Writing Data ------------ -- cgit v1.2.3 From a0fcaaed0c46cf9399d3a2d6e0c87ddb3df0e044 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 29 Sep 2022 10:49:09 -0400 Subject: ring-buffer: Fix race between reset page and reading page The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking. If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go passed the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded". The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding. Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues. To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it. To be paranoid, it will only spin for 1 second, otherwise it will warn and shutdown the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second! Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3046deacf7b3..c3f354cfc5ba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2648,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Mark the rest of the page with padding */ rb_event_set_padding(event); + /* Make sure the padding is visible before the write update */ + smp_wmb(); + /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; @@ -2659,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); @@ -4627,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + /* + * The writer has preempt disable, wait for it. But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()) + */ + smp_rmb(); + + return reader; } -- cgit v1.2.3 From 0e0f0b74f895942afaf2f9213b97a6cb021d05df Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 28 Sep 2022 12:46:21 +0100 Subject: MAINTAINERS: add myself as a tracing reviewer Since I'm actively involved in a number of arch bits that intersect ftrace (e.g. the actual arch implementation on arm64, stacktracing, entry management, and general instrumentation safety), add myself as a reviewer of the core ftrace code so that I have the change to catch any potential problems early. I spoke with Steven about this at LPC, and it seemed to make sense to add me as a reviewer. Link: https://lkml.kernel.org/r/20220928114621.248038-1-mark.rutland@arm.com Cc: Ingo Molnar Signed-off-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f5ca4aefd184..ef2fc5e581b4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20615,6 +20615,7 @@ F: drivers/char/tpm/ TRACING M: Steven Rostedt M: Ingo Molnar +R: Mark Rutland S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git F: Documentation/trace/ftrace.rst -- cgit v1.2.3 From ed87277f122674a943239c6e60d352c8d56deb50 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Fri, 30 Sep 2022 18:32:36 +0800 Subject: tracing: Remove unused variable 'dups' Reported by Clang [-Wunused-but-set-variable] 'commit c193707dde77 ("tracing: Remove code which merges duplicates")' This commit removed the code which merges duplicates in detect_dups(), but forgot to delete the variable 'dups' which used to merge duplicates in the loop. Now only 'total_dups' is needed, remove 'dups' for clean code. Link: https://lkml.kernel.org/r/20220930103236.253985-1-chenzhongjin@huawei.com Signed-off-by: Chen Zhongjin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9901708ce6b8..c774e560f2f9 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { - unsigned int dups = 0, total_dups = 0; + unsigned int total_dups = 0; int i; void *key; @@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries, key = sort_entries[0]->key; for (i = 1; i < n_entries; i++) { if (!memcmp(sort_entries[i]->key, key, key_size)) { - dups++; total_dups++; + total_dups++; continue; } key = sort_entries[i]->key; - dups = 0; } WARN_ONCE(total_dups > 0, -- cgit v1.2.3 From 5d8d2bb946dd24d00c99384471ad8148571a9fbd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 30 Sep 2022 12:41:31 -0400 Subject: tracing: Add Masami Hiramatsu as co-maintainer Masami has been maintaining kprobes for a while now and that code has been an integral part of tracing. He has also been an excellent reviewer of all the tracing code and contributor as well. The tracing subsystem needs another active maintainer to keep it running smoothly, and I do not know anyone more qualified for the job than Masami. Ingo has also told me that he has not been active in the tracing code for some time and said he could be removed from the TRACING portion of the MAINTAINERS file. Link: https://lkml.kernel.org/r/20220930124131.7b6432dd@gandalf.local.home Acked-by: Ingo Molnar Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ef2fc5e581b4..749558b09464 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20614,7 +20614,7 @@ F: drivers/char/tpm/ TRACING M: Steven Rostedt -M: Ingo Molnar +M: Masami Hiramatsu R: Mark Rutland S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git @@ -20631,7 +20631,7 @@ F: tools/testing/selftests/ftrace/ TRACING MMIO ACCESSES (MMIOTRACE) M: Steven Rostedt -M: Ingo Molnar +M: Masami Hiramatsu R: Karol Herbst R: Pekka Paalanen L: linux-kernel@vger.kernel.org -- cgit v1.2.3 From e5d271812e7a4d527e65b0228b4a16795c0e0c6c Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 30 Sep 2022 17:10:16 -0700 Subject: tracing/user_events: Move pages/locks into groups to prepare for namespaces In order to enable namespaces or any sort of isolation within user_events the register lock and pages need to be broken up into groups. Each event and file now has a group pointer which stores the actual pages to map, lookup data and synchronization objects. This only enables a single group that maps to init_user_ns, as IMA namespace has done. This enables user_events to start the work of supporting namespaces by walking the namespaces up to the init_user_ns. Future patches will address other user namespaces and will align to the approaches the IMA namespace uses. Link: https://lore.kernel.org/linux-kernel/20220915193221.1728029-15-stefanb@linux.ibm.com/#t Link: https://lkml.kernel.org/r/20221001001016.2832-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 346 +++++++++++++++++++++++++++++++-------- 1 file changed, 274 insertions(+), 72 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 2c0a6ec75548..ae78c2d53c8a 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -74,11 +74,25 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) -static char *register_page_data; +/* + * Stores the pages, tables, and locks for a group of events. + * Each logical grouping of events has its own group, with a + * matching page for status checks within user programs. This + * allows for isolation of events to user programs by various + * means. + */ +struct user_event_group { + struct page *pages; + char *register_page_data; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; + DECLARE_HASHTABLE(register_table, 8); + DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +}; -static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 8); -static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +/* Group for init_user_ns mapping, top-most group */ +static struct user_event_group *init_group; /* * Stores per-event properties, as users register events @@ -88,6 +102,7 @@ static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); * refcnt reaches one. */ struct user_event { + struct user_event_group *group; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -114,6 +129,11 @@ struct user_event_refs { struct user_event *events[]; }; +struct user_event_file_info { + struct user_event_group *group; + struct user_event_refs *refs; +}; + #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) @@ -126,7 +146,8 @@ struct user_event_validator { typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser); static u32 user_event_key(char *name) @@ -134,12 +155,128 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static void set_page_reservations(char *pages, bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = pages + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static void user_event_group_destroy(struct user_event_group *group) +{ + if (group->register_page_data) + set_page_reservations(group->register_page_data, false); + + if (group->pages) + __free_pages(group->pages, MAX_PAGE_ORDER); + + kfree(group->system_name); + kfree(group); +} + +static char *user_event_group_system_name(struct user_namespace *user_ns) +{ + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; + + if (user_ns != &init_user_ns) { + /* + * Unexpected at this point: + * We only currently support init_user_ns. + * When we enable more, this will trigger a failure so log. + */ + pr_warn("user_events: Namespace other than init_user_ns!\n"); + return NULL; + } + + system_name = kmalloc(len, GFP_KERNEL); + + if (!system_name) + return NULL; + + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + + return system_name; +} + +static inline struct user_event_group +*user_event_group_from_user_ns(struct user_namespace *user_ns) +{ + if (user_ns == &init_user_ns) + return init_group; + + return NULL; +} + +static struct user_event_group *current_user_event_group(void) +{ + struct user_namespace *user_ns = current_user_ns(); + struct user_event_group *group = NULL; + + while (user_ns) { + group = user_event_group_from_user_ns(user_ns); + + if (group) + break; + + user_ns = user_ns->parent; + } + + return group; +} + +static struct user_event_group +*user_event_group_create(struct user_namespace *user_ns) +{ + struct user_event_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + + if (!group) + return NULL; + + group->system_name = user_event_group_system_name(user_ns); + + if (!group->system_name) + goto error; + + group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + + if (!group->pages) + goto error; + + group->register_page_data = page_address(group->pages); + + set_page_reservations(group->register_page_data, true); + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(group->page_bitmap, MAX_EVENTS); + set_bit(0, group->page_bitmap); + + mutex_init(&group->reg_mutex); + hash_init(group->register_table); + + return group; +error: + if (group) + user_event_group_destroy(group); + + return NULL; +}; + static __always_inline void user_event_register_set(struct user_event *user) { int i = user->index; - register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); + user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); } static __always_inline @@ -147,7 +284,7 @@ void user_event_register_clear(struct user_event *user) { int i = user->index; - register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); + user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); } static __always_inline __must_check @@ -191,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * * Upon success user_event has its ref count increased by 1. */ -static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +static int user_event_parse_cmd(struct user_event_group *group, + char *raw_command, struct user_event **newuser) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -205,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) if (flags) *flags++ = '\0'; - return user_event_parse(name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser); } static int user_field_array_size(const char *type) @@ -693,7 +831,7 @@ static int destroy_user_event(struct user_event *user) dyn_event_remove(&user->devent); user_event_register_clear(user); - clear_bit(user->index, page_bitmap); + clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -704,14 +842,15 @@ static int destroy_user_event(struct user_event *user) return ret; } -static struct user_event *find_user_event(char *name, u32 *outkey) +static struct user_event *find_user_event(struct user_event_group *group, + char *name, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(register_table, user, node, key) + hash_for_each_possible(group->register_table, user, node, key) if (!strcmp(EVENT_NAME(user), name)) { refcount_inc(&user->refcnt); return user; @@ -943,6 +1082,7 @@ dec: static int user_event_create(const char *raw_command) { + struct user_event_group *group; struct user_event *user; char *name; int ret; @@ -958,14 +1098,19 @@ static int user_event_create(const char *raw_command) if (!name) return -ENOMEM; - mutex_lock(®_mutex); + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(name, &user); + ret = user_event_parse_cmd(group, name, &user); if (!ret) refcount_dec(&user->refcnt); - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); if (ret) kfree(name); @@ -1119,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user) * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1. */ -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser) { int ret; @@ -1129,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags, /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(name, &key); + user = find_user_event(group, name, &key); mutex_unlock(&event_mutex); if (user) { @@ -1142,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags, return 0; } - index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); if (index == MAX_EVENTS) return -EMFILE; @@ -1156,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags, INIT_LIST_HEAD(&user->fields); INIT_LIST_HEAD(&user->validators); + user->group = group; user->tracepoint.name = name; ret = user_event_parse_fields(user, args); @@ -1174,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags, user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; + user->class.system = group->system_name; - user->class.system = USER_EVENTS_SYSTEM; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; user->class.reg = user_event_reg; @@ -1198,8 +1345,8 @@ static int user_event_parse(char *name, char *args, char *flags, dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, page_bitmap); - hash_add(register_table, &user->node, key); + set_bit(user->index, group->page_bitmap); + hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1217,10 +1364,10 @@ put_user: /* * Deletes a previously created event if it is no longer being used. */ -static int delete_user_event(char *name) +static int delete_user_event(struct user_event_group *group, char *name) { u32 key; - struct user_event *user = find_user_event(name, &key); + struct user_event *user = find_user_event(group, name, &key); if (!user) return -ENOENT; @@ -1238,6 +1385,7 @@ static int delete_user_event(char *name) */ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) { + struct user_event_file_info *info = file->private_data; struct user_event_refs *refs; struct user_event *user = NULL; struct tracepoint *tp; @@ -1249,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) rcu_read_lock_sched(); - refs = rcu_dereference_sched(file->private_data); + refs = rcu_dereference_sched(info->refs); /* * The refs->events array is protected by RCU, and new items may be @@ -1307,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) return ret; } +static int user_events_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + struct user_event_file_info *info; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return -ENOMEM; + + info->group = group; + + file->private_data = info; + + return 0; +} + static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -1328,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) return user_events_write_core(kp->ki_filp, i); } -static int user_events_ref_add(struct file *file, struct user_event *user) +static int user_events_ref_add(struct user_event_file_info *info, + struct user_event *user) { + struct user_event_group *group = info->group; struct user_event_refs *refs, *new_refs; int i, size, count = 0; - refs = rcu_dereference_protected(file->private_data, - lockdep_is_held(®_mutex)); + refs = rcu_dereference_protected(info->refs, + lockdep_is_held(&group->reg_mutex)); if (refs) { count = refs->count; @@ -1360,7 +1532,7 @@ static int user_events_ref_add(struct file *file, struct user_event *user) refcount_inc(&user->refcnt); - rcu_assign_pointer(file->private_data, new_refs); + rcu_assign_pointer(info->refs, new_refs); if (refs) kfree_rcu(refs, rcu); @@ -1397,7 +1569,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) /* * Registers a user_event on behalf of a user process. */ -static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +static long user_events_ioctl_reg(struct user_event_file_info *info, + unsigned long uarg) { struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; @@ -1418,14 +1591,14 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) return ret; } - ret = user_event_parse_cmd(name, &user); + ret = user_event_parse_cmd(info->group, name, &user); if (ret) { kfree(name); return ret; } - ret = user_events_ref_add(file, user); + ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ refcount_dec(&user->refcnt); @@ -1443,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) /* * Deletes a user_event on behalf of a user process. */ -static long user_events_ioctl_del(struct file *file, unsigned long uarg) +static long user_events_ioctl_del(struct user_event_file_info *info, + unsigned long uarg) { void __user *ubuf = (void __user *)uarg; char *name; @@ -1456,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) /* event_mutex prevents dyn_event from racing */ mutex_lock(&event_mutex); - ret = delete_user_event(name); + ret = delete_user_event(info->group, name); mutex_unlock(&event_mutex); kfree(name); @@ -1470,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) static long user_events_ioctl(struct file *file, unsigned int cmd, unsigned long uarg) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group = info->group; long ret = -ENOTTY; switch (cmd) { case DIAG_IOCSREG: - mutex_lock(®_mutex); - ret = user_events_ioctl_reg(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_reg(info, uarg); + mutex_unlock(&group->reg_mutex); break; case DIAG_IOCSDEL: - mutex_lock(®_mutex); - ret = user_events_ioctl_del(file, uarg); - mutex_unlock(®_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_del(info, uarg); + mutex_unlock(&group->reg_mutex); break; } @@ -1494,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, */ static int user_events_release(struct inode *node, struct file *file) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group; struct user_event_refs *refs; struct user_event *user; int i; + if (!info) + return -EINVAL; + + group = info->group; + /* * Ensure refs cannot change under any situation by taking the * register mutex during the final freeing of the references. */ - mutex_lock(®_mutex); + mutex_lock(&group->reg_mutex); - refs = file->private_data; + refs = info->refs; if (!refs) goto out; @@ -1523,32 +1706,51 @@ static int user_events_release(struct inode *node, struct file *file) out: file->private_data = NULL; - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); kfree(refs); + kfree(info); return 0; } static const struct file_operations user_data_fops = { + .open = user_events_open, .write = user_events_write, .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, .release = user_events_release, }; +static struct user_event_group *user_status_group(struct file *file) +{ + struct seq_file *m = file->private_data; + + if (!m) + return NULL; + + return m->private; +} + /* * Maps the shared page into the user process for checking if event is enabled. */ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) { + char *pages; + struct user_event_group *group = user_status_group(file); unsigned long size = vma->vm_end - vma->vm_start; if (size != MAX_BYTES) return -EINVAL; + if (!group) + return -EINVAL; + + pages = group->register_page_data; + return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(register_page_data) >> PAGE_SHIFT, + virt_to_phys(pages) >> PAGE_SHIFT, size, vm_get_page_prot(VM_READ)); } @@ -1572,13 +1774,17 @@ static void user_seq_stop(struct seq_file *m, void *p) static int user_seq_show(struct seq_file *m, void *p) { + struct user_event_group *group = m->private; struct user_event *user; char status; int i, active = 0, busy = 0, flags; - mutex_lock(®_mutex); + if (!group) + return -EINVAL; + + mutex_lock(&group->reg_mutex); - hash_for_each(register_table, i, user, node) { + hash_for_each(group->register_table, i, user, node) { status = user->status; flags = user->flags; @@ -1602,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p) active++; } - mutex_unlock(®_mutex); + mutex_unlock(&group->reg_mutex); seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); @@ -1621,7 +1827,24 @@ static const struct seq_operations user_seq_ops = { static int user_status_open(struct inode *node, struct file *file) { - return seq_open(file, &user_seq_ops); + struct user_event_group *group; + int ret; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + ret = seq_open(file, &user_seq_ops); + + if (!ret) { + /* Chain group to seq_file */ + struct seq_file *m = file->private_data; + + m->private = group; + } + + return ret; } static const struct file_operations user_status_fops = { @@ -1662,42 +1885,21 @@ err: return -ENODEV; } -static void set_page_reservations(bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = register_page_data + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static int __init trace_events_user_init(void) { - struct page *pages; int ret; - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(page_bitmap, MAX_EVENTS); - set_bit(0, page_bitmap); + init_group = user_event_group_create(&init_user_ns); - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - if (!pages) + if (!init_group) return -ENOMEM; - register_page_data = page_address(pages); - - set_page_reservations(true); ret = create_user_tracefs(); if (ret) { pr_warn("user_events could not register with tracefs\n"); - set_page_reservations(false); - __free_pages(pages, MAX_PAGE_ORDER); + user_event_group_destroy(init_group); + init_group = NULL; return ret; } -- cgit v1.2.3 From cf04f2d5df0037741207382ac8fe289e8bf84ced Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 5 Oct 2022 00:38:09 -0400 Subject: ftrace: Still disable enabled records marked as disabled Weak functions started causing havoc as they showed up in the "available_filter_functions" and this confused people as to why some functions marked as "notrace" were listed, but when enabled they did nothing. This was because weak functions can still have fentry calls, and these addresses get added to the "available_filter_functions" file. kallsyms is what converts those addresses to names, and since the weak functions are not listed in kallsyms, it would just pick the function before that. To solve this, there was a trick to detect weak functions listed, and these records would be marked as DISABLED so that they do not get enabled and are mostly ignored. As the processing of the list of all functions to figure out what is weak or not can take a long time, this process is put off into a kernel thread and run in parallel with the rest of start up. Now the issue happens whet function tracing is enabled via the kernel command line. As it starts very early in boot up, it can be enabled before the records that are weak are marked to be disabled. This causes an issue in the accounting, as the weak records are enabled by the command line function tracing, but after boot up, they are not disabled. The ftrace records have several accounting flags and a ref count. The DISABLED flag is just one. If the record is enabled before it is marked DISABLED it will get an ENABLED flag and also have its ref counter incremented. After it is marked for DISABLED, neither the ENABLED flag nor the ref counter is cleared. There's sanity checks on the records that are performed after an ftrace function is registered or unregistered, and this detected that there were records marked as ENABLED with ref counter that should not have been. Note, the module loading code uses the DISABLED flag as well to keep its functions from being modified while its being loaded and some of these flags may get set in this process. So changing the verification code to ignore DISABLED records is a no go, as it still needs to verify that the module records are working too. Also, the weak functions still are calling a trampoline. Even though they should never be called, it is dangerous to leave these weak functions calling a trampoline that is freed, so they should still be set back to nops. There's two places that need to not skip records that have the ENABLED and the DISABLED flags set. That is where the ftrace_ops is processed and sets the records ref counts, and then later when the function itself is to be updated, and the ENABLED flag gets removed. Add a helper function "skip_record()" that returns true if the record has the DISABLED flag set but not the ENABLED flag. Link: https://lkml.kernel.org/r/20221005003809.27d2b97b@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: b39181f7c6907 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 406d0597c409..83362a155791 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); +static bool skip_record(struct dyn_ftrace *rec) +{ + /* + * At boot up, weak functions are set to disable. Function tracing + * can be enabled before they are, and they still need to be disabled now. + * If the record is disabled, still continue if it is marked as already + * enabled (this is needed to keep the accounting working). + */ + return rec->flags & FTRACE_FL_DISABLED && + !(rec->flags & FTRACE_FL_ENABLED); +} + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; if (all) { @@ -2126,7 +2138,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) ftrace_bug_type = FTRACE_BUG_UNKNOWN; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) return FTRACE_UPDATE_IGNORE; /* @@ -2241,7 +2253,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) if (update) { /* If there's no more users, clear all flags */ if (!ftrace_rec_count(rec)) - rec->flags = 0; + rec->flags &= FTRACE_FL_DISABLED; else /* * Just disable the record, but keep the ops TRAMP @@ -2634,7 +2646,7 @@ void __weak ftrace_replace_code(int mod_flags) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; failed = __ftrace_replace_code(rec, enable); -- cgit v1.2.3 From a541a9559bb0a8ecc434de01d3e4826c32e8bb53 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 5 Oct 2022 11:37:57 -0400 Subject: tracing: Do not free snapshot if tracer is on cmdline The ftrace_boot_snapshot and alloc_snapshot cmdline options allocate the snapshot buffer at boot up for use later. The ftrace_boot_snapshot in particular requires the snapshot to be allocated because it will take a snapshot at the end of boot up allowing to see the traces that happened during boot so that it's not lost when user space takes over. When a tracer is registered (started) there's a path that checks if it requires the snapshot buffer or not, and if it does not and it was allocated it will do a synchronization and free the snapshot buffer. This is only required if the previous tracer was using it for "max latency" snapshots, as it needs to make sure all max snapshots are complete before freeing. But this is only needed if the previous tracer was using the snapshot buffer for latency (like irqoff tracer and friends). But it does not make sense to free it, if the previous tracer was not using it, and the snapshot was allocated by the cmdline parameters. This basically takes away the point of allocating it in the first place! Note, the allocated snapshot worked fine for just trace events, but fails when a tracer is enabled on the cmdline. Further investigation, this goes back even further and it does not require a tracer on the cmdline to fail. Simply enable snapshots and then enable a tracer, and it will remove the snapshot. Link: https://lkml.kernel.org/r/20221005113757.041df7fe@gandalf.local.home Cc: Masami Hiramatsu Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: 45ad21ca5530 ("tracing: Have trace_array keep track if snapshot buffer is allocated") Reported-by: Ross Zwisler Tested-by: Ross Zwisler Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index def721de68a0..47a44b055a1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6428,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->current_trace->use_max_tr; + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->allocated_snapshot; - if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that @@ -6446,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } - if (t->use_max_tr && !had_max_tr) { + if (t->use_max_tr && !tr->allocated_snapshot) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } +#else + tr->current_trace = &nop_trace; #endif if (t->init) { -- cgit v1.2.3 From fb17b268396046ee8a10c0f5d30c0a7afeef77e2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 6 Oct 2022 10:43:47 -0400 Subject: tracing: Update MAINTAINERS to reflect new tracing git repo The tracing git repo will no longer be housed in my personal git repo, but instead live in trace/linux-trace.git. Update the MAINTAINERS file appropriately. Link: https://lkml.kernel.org/r/20221006144439.282193367@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 749558b09464..86b8aa4c11cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11380,7 +11380,7 @@ M: Anil S Keshavamurthy M: "David S. Miller" M: Masami Hiramatsu S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/kprobes.rst F: include/asm-generic/kprobes.h F: include/linux/kprobes.h @@ -20617,7 +20617,7 @@ M: Steven Rostedt M: Masami Hiramatsu R: Mark Rutland S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git F: Documentation/trace/ftrace.rst F: arch/*/*/*/*ftrace* F: arch/*/*/*ftrace* -- cgit v1.2.3 From 4f881a696484a7d31a4d1b12547615b1a3ee5771 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 6 Oct 2022 10:43:48 -0400 Subject: ftrace: Create separate entry in MAINTAINERS for function hooks The function hooks (ftrace) is a completely different subsystem from the general tracing. It manages how to attach callbacks to most functions in the kernel. It is also used by live kernel patching. It really is not part of tracing, although tracing uses it. Create a separate entry for FUNCTION HOOKS (FTRACE) to be separate from tracing itself in the MAINTAINERS file. Perhaps it should be moved out of the kernel/trace directory, but that's for another time. Link: https://lkml.kernel.org/r/20221006144439.459272364@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Acked-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 86b8aa4c11cb..d95f5d3b4d37 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8411,6 +8411,19 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/fujitsu-tablet.c +FUNCTION HOOKS (FTRACE) +M: Steven Rostedt +M: Masami Hiramatsu +R: Mark Rutland +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git +F: Documentation/trace/ftrace* +F: kernel/trace/ftrace* +F: kernel/trace/fgraph.c +F: arch/*/*/*/*ftrace* +F: arch/*/*/*ftrace* +F: include/*/ftrace.h + FUNGIBLE ETHERNET DRIVERS M: Dimitris Michailidis L: netdev@vger.kernel.org @@ -20615,14 +20628,10 @@ F: drivers/char/tpm/ TRACING M: Steven Rostedt M: Masami Hiramatsu -R: Mark Rutland S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git -F: Documentation/trace/ftrace.rst -F: arch/*/*/*/*ftrace* -F: arch/*/*/*ftrace* +F: Documentation/trace/* F: fs/tracefs/ -F: include/*/ftrace.h F: include/linux/trace*.h F: include/trace/ F: kernel/trace/ -- cgit v1.2.3