diff options
Diffstat (limited to 'Documentation/trace')
-rw-r--r-- | Documentation/trace/boottime-trace.rst | 91 | ||||
-rw-r--r-- | Documentation/trace/coresight/coresight-config.rst | 244 | ||||
-rw-r--r-- | Documentation/trace/coresight/coresight-etm4x-reference.rst | 2 | ||||
-rw-r--r-- | Documentation/trace/coresight/coresight.rst | 23 | ||||
-rw-r--r-- | Documentation/trace/ftrace.rst | 8 | ||||
-rw-r--r-- | Documentation/trace/histogram.rst | 112 | ||||
-rw-r--r-- | Documentation/trace/hwlat_detector.rst | 13 | ||||
-rw-r--r-- | Documentation/trace/index.rst | 2 | ||||
-rw-r--r-- | Documentation/trace/kprobes.rst | 24 | ||||
-rw-r--r-- | Documentation/trace/osnoise-tracer.rst | 152 | ||||
-rw-r--r-- | Documentation/trace/timerlat-tracer.rst | 181 |
11 files changed, 800 insertions, 52 deletions
diff --git a/Documentation/trace/boottime-trace.rst b/Documentation/trace/boottime-trace.rst index 89b64334929b..6dcfbc64014d 100644 --- a/Documentation/trace/boottime-trace.rst +++ b/Documentation/trace/boottime-trace.rst @@ -99,6 +99,12 @@ These options are setting per-event options. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.enable Enable GROUP:EVENT tracing. +ftrace.[instance.INSTANCE.]event.GROUP.enable + Enable all event tracing within GROUP. + +ftrace.[instance.INSTANCE.]event.enable + Enable all event tracing. + ftrace.[instance.INSTANCE.]event.GROUP.EVENT.filter = FILTER Set FILTER rule to the GROUP:EVENT. @@ -119,6 +125,71 @@ Note that kprobe and synthetic event definitions can be written under instance node, but those are also visible from other instances. So please take care for event name conflict. +Ftrace Histogram Options +------------------------ + +Since it is too long to write a histogram action as a string for per-event +action option, there are tree-style options under per-event 'hist' subkey +for the histogram actions. For the detail of the each parameter, +please read the event histogram document [3]_. + +.. [3] See :ref:`Documentation/trace/histogram.rst <histogram>` + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]keys = KEY1[, KEY2[...]] + Set histogram key parameters. (Mandatory) + The 'N' is a digit string for the multiple histogram. You can omit it + if there is one histogram on the event. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]values = VAL1[, VAL2[...]] + Set histogram value parameters. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]sort = SORT1[, SORT2[...]] + Set histogram sort parameter options. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]size = NR_ENTRIES + Set histogram size (number of entries). + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]name = NAME + Set histogram name. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]var.VARIABLE = EXPR + Define a new VARIABLE by EXPR expression. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<pause|continue|clear> + Set histogram control parameter. You can set one of them. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]event = GROUP.EVENT + Set histogram 'onmatch' handler matching event parameter. + The 'M' is a digit string for the multiple 'onmatch' handler. You can omit it + if there is one 'onmatch' handler on this histogram. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]trace = EVENT[, ARG1[...]] + Set histogram 'trace' action for 'onmatch'. + EVENT must be a synthetic event name, and ARG1... are parameters + for that event. Mandatory if 'onmatch.event' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmax.[M.]var = VAR + Set histogram 'onmax' handler variable parameter. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onchange.[M.]var = VAR + Set histogram 'onchange' handler variable parameter. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<onmax|onchange>.[M.]save = ARG1[, ARG2[...]] + Set histogram 'save' action parameters for 'onmax' or 'onchange' handler. + This option or below 'snapshot' option is mandatory if 'onmax.var' or + 'onchange.var' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<onmax|onchange>.[M.]snapshot + Set histogram 'snapshot' action for 'onmax' or 'onchange' handler. + This option or above 'save' option is mandatory if 'onmax.var' or + 'onchange.var' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.filter = FILTER_EXPR + Set histogram filter expression. You don't need 'if' in the FILTER_EXPR. + +Note that this 'hist' option can conflict with the per-event 'actions' +option if the 'actions' option has a histogram action. + When to Start ============= @@ -153,13 +224,23 @@ below:: } synthetic.initcall_latency { fields = "unsigned long func", "u64 lat" - actions = "hist:keys=func.sym,lat:vals=lat:sort=lat" + hist { + keys = func.sym, lat + values = lat + sort = lat + } } - initcall.initcall_start { - actions = "hist:keys=func:ts0=common_timestamp.usecs" + initcall.initcall_start.hist { + keys = func + var.ts0 = common_timestamp.usecs } - initcall.initcall_finish { - actions = "hist:keys=func:lat=common_timestamp.usecs-$ts0:onmatch(initcall.initcall_start).initcall_latency(func,$lat)" + initcall.initcall_finish.hist { + keys = func + var.lat = common_timestamp.usecs - $ts0 + onmatch { + event = initcall.initcall_start + trace = initcall_latency, func, $lat + } } } diff --git a/Documentation/trace/coresight/coresight-config.rst b/Documentation/trace/coresight/coresight-config.rst new file mode 100644 index 000000000000..a4e3ef295240 --- /dev/null +++ b/Documentation/trace/coresight/coresight-config.rst @@ -0,0 +1,244 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================== +CoreSight System Configuration Manager +====================================== + + :Author: Mike Leach <mike.leach@linaro.org> + :Date: October 2020 + +Introduction +============ + +The CoreSight System Configuration manager is an API that allows the +programming of the CoreSight system with pre-defined configurations that +can then be easily enabled from sysfs or perf. + +Many CoreSight components can be programmed in complex ways - especially ETMs. +In addition, components can interact across the CoreSight system, often via +the cross trigger components such as CTI and CTM. These system settings can +be defined and enabled as named configurations. + + +Basic Concepts +============== + +This section introduces the basic concepts of a CoreSight system configuration. + + +Features +-------- + +A feature is a named set of programming for a CoreSight device. The programming +is device dependent, and can be defined in terms of absolute register values, +resource usage and parameter values. + +The feature is defined using a descriptor. This descriptor is used to load onto +a matching device, either when the feature is loaded into the system, or when the +CoreSight device is registered with the configuration manager. + +The load process involves interpreting the descriptor into a set of register +accesses in the driver - the resource usage and parameter descriptions +translated into appropriate register accesses. This interpretation makes it easy +and efficient for the feature to be programmed onto the device when required. + +The feature will not be active on the device until the feature is enabled, and +the device itself is enabled. When the device is enabled then enabled features +will be programmed into the device hardware. + +A feature is enabled as part of a configuration being enabled on the system. + + +Parameter Value +~~~~~~~~~~~~~~~ + +A parameter value is a named value that may be set by the user prior to the +feature being enabled that can adjust the behaviour of the operation programmed +by the feature. + +For example, this could be a count value in a programmed operation that repeats +at a given rate. When the feature is enabled then the current value of the +parameter is used in programming the device. + +The feature descriptor defines a default value for a parameter, which is used +if the user does not supply a new value. + +Users can update parameter values using the configfs API for the CoreSight +system - which is described below. + +The current value of the parameter is loaded into the device when the feature +is enabled on that device. + + +Configurations +-------------- + +A configuration defines a set of features that are to be used in a trace +session where the configuration is selected. For any trace session only one +configuration may be selected. + +The features defined may be on any type of device that is registered +to support system configuration. A configuration may select features to be +enabled on a class of devices - i.e. any ETMv4, or specific devices, e.g. a +specific CTI on the system. + +As with the feature, a descriptor is used to define the configuration. +This will define the features that must be enabled as part of the configuration +as well as any preset values that can be used to override default parameter +values. + + +Preset Values +~~~~~~~~~~~~~ + +Preset values are easily selectable sets of parameter values for the features +that the configuration uses. The number of values in a single preset set, equals +the sum of parameter values in the features used by the configuration. + +e.g. a configuration consists of 3 features, one has 2 parameters, one has +a single parameter, and another has no parameters. A single preset set will +therefore have 3 values. + +Presets are optionally defined by the configuration, up to 15 can be defined. +If no preset is selected, then the parameter values defined in the feature +are used as normal. + + +Operation +~~~~~~~~~ + +The following steps take place in the operation of a configuration. + +1) In this example, the configuration is 'autofdo', which has an + associated feature 'strobing' that works on ETMv4 CoreSight Devices. + +2) The configuration is enabled. For example 'perf' may select the + configuration as part of its command line:: + + perf record -e cs_etm/autofdo/ myapp + + which will enable the 'autofdo' configuration. + +3) perf starts tracing on the system. As each ETMv4 that perf uses for + trace is enabled, the configuration manager will check if the ETMv4 + has a feature that relates to the currently active configuration. + In this case 'strobing' is enabled & programmed into the ETMv4. + +4) When the ETMv4 is disabled, any registers marked as needing to be + saved will be read back. + +5) At the end of the perf session, the configuration will be disabled. + + +Viewing Configurations and Features +=================================== + +The set of configurations and features that are currently loaded into the +system can be viewed using the configfs API. + +Mount configfs as normal and the 'cs-syscfg' subsystem will appear:: + + $ ls /config + cs-syscfg stp-policy + +This has two sub-directories:: + + $ cd cs-syscfg/ + $ ls + configurations features + +The system has the configuration 'autofdo' built in. It may be examined as +follows:: + + $ cd configurations/ + $ ls + autofdo + $ cd autofdo/ + $ ls + description preset1 preset3 preset5 preset7 preset9 + feature_refs preset2 preset4 preset6 preset8 + $ cat description + Setup ETMs with strobing for autofdo + $ cat feature_refs + strobing + +Each preset declared has a preset<n> subdirectory declared. The values for +the preset can be examined:: + + $ cat preset1/values + strobing.window = 0x1388 strobing.period = 0x2 + $ cat preset2/values + strobing.window = 0x1388 strobing.period = 0x4 + +The features referenced by the configuration can be examined in the features +directory:: + + $ cd ../../features/strobing/ + $ ls + description matches nr_params params + $ cat description + Generate periodic trace capture windows. + parameter 'window': a number of CPU cycles (W) + parameter 'period': trace enabled for W cycles every period x W cycles + $ cat matches + SRC_ETMV4 + $ cat nr_params + 2 + +Move to the params directory to examine and adjust parameters:: + + cd params + $ ls + period window + $ cd period + $ ls + value + $ cat value + 0x2710 + # echo 15000 > value + # cat value + 0x3a98 + +Parameters adjusted in this way are reflected in all device instances that have +loaded the feature. + + +Using Configurations in perf +============================ + +The configurations loaded into the CoreSight configuration management are +also declared in the perf 'cs_etm' event infrastructure so that they can +be selected when running trace under perf:: + + $ ls /sys/devices/cs_etm + configurations format perf_event_mux_interval_ms sinks type + events nr_addr_filters power + +Key directories here are 'configurations' - which lists the loaded +configurations, and 'events' - a generic perf directory which allows +selection on the perf command line.:: + + $ ls configurations/ + autofdo + $ cat configurations/autofdo + 0xa7c3dddd + +As with the sinks entries, this provides a hash of the configuration name. +The entry in the 'events' directory uses perfs built in syntax generator +to substitute the syntax for the name when evaluating the command:: + + $ ls events/ + autofdo + $ cat events/autofdo + configid=0xa7c3dddd + +The 'autofdo' configuration may be selected on the perf command line:: + + $ perf record -e cs_etm/autofdo/u --per-thread <application> + +A preset to override the current parameter values can also be selected:: + + $ perf record -e cs_etm/autofdo,preset=1/u --per-thread <application> + +When configurations are selected in this way, then the trace sink used is +automatically selected. diff --git a/Documentation/trace/coresight/coresight-etm4x-reference.rst b/Documentation/trace/coresight/coresight-etm4x-reference.rst index b64d9a9c79df..d25dfe86af9b 100644 --- a/Documentation/trace/coresight/coresight-etm4x-reference.rst +++ b/Documentation/trace/coresight/coresight-etm4x-reference.rst @@ -427,7 +427,7 @@ the ‘TRC’ prefix. :Syntax: ``echo idx > vmid_idx`` - Where idx < numvmidc + Where idx < numvmidc ---- diff --git a/Documentation/trace/coresight/coresight.rst b/Documentation/trace/coresight/coresight.rst index 169749efd8d1..a15571d96cc8 100644 --- a/Documentation/trace/coresight/coresight.rst +++ b/Documentation/trace/coresight/coresight.rst @@ -315,7 +315,8 @@ intermediate links as required. Note: ``cti_sys0`` appears in two of the connections lists above. CTIs can connect to multiple devices and are arranged in a star topology -via the CTM. See (:doc:`coresight-ect`) [#fourth]_ for further details. +via the CTM. See (Documentation/trace/coresight/coresight-ect.rst) +[#fourth]_ for further details. Looking at this device we see 4 connections:: linaro-developer:~# ls -l /sys/bus/coresight/devices/cti_sys0/connections @@ -606,7 +607,8 @@ interface provided for that purpose by the generic STM API:: crw------- 1 root root 10, 61 Jan 3 18:11 /dev/stm0 root@genericarmv8:~# -Details on how to use the generic STM API can be found here:- :doc:`../stm` [#second]_. +Details on how to use the generic STM API can be found here: +- Documentation/trace/stm.rst [#second]_. The CTI & CTM Modules --------------------- @@ -616,7 +618,20 @@ individual CTIs and components, and can propagate these between all CTIs via channels on the CTM (Cross Trigger Matrix). A separate documentation file is provided to explain the use of these devices. -(:doc:`coresight-ect`) [#fourth]_. +(Documentation/trace/coresight/coresight-ect.rst) [#fourth]_. + +CoreSight System Configuration +------------------------------ + +CoreSight components can be complex devices with many programming options. +Furthermore, components can be programmed to interact with each other across the +complete system. + +A CoreSight System Configuration manager is provided to allow these complex programming +configurations to be selected and used easily from perf and sysfs. + +See the separate document for further information. +(Documentation/trace/coresight/coresight-config.rst) [#fifth]_. .. [#first] Documentation/ABI/testing/sysfs-bus-coresight-devices-stm @@ -626,3 +641,5 @@ A separate documentation file is provided to explain the use of these devices. .. [#third] https://github.com/Linaro/perf-opencsd .. [#fourth] Documentation/trace/coresight/coresight-ect.rst + +.. [#fifth] Documentation/trace/coresight/coresight-config.rst diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 62c98e9bbdd9..4e5b26f03d5b 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -40,7 +40,7 @@ See events.rst for more information. Implementation Details ---------------------- -See :doc:`ftrace-design` for details for arch porters and such. +See Documentation/trace/ftrace-design.rst for details for arch porters and such. The File System @@ -354,8 +354,8 @@ of ftrace. Here is a list of some of the key files: is being directly called by the function. If the count is greater than 1 it most likely will be ftrace_ops_list_func(). - If the callback of the function jumps to a trampoline that is - specific to a the callback and not the standard trampoline, + If the callback of a function jumps to a trampoline that is + specific to the callback and which is not the standard trampoline, its address will be printed as well as the function that the trampoline calls. @@ -2762,7 +2762,7 @@ listed in: put_prev_task_idle kmem_cache_create pick_next_task_rt - get_online_cpus + cpus_read_lock pick_next_task_fair mutex_lock [...] diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index b71e09f745c3..533415644c54 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -70,15 +70,16 @@ Documentation written by Tom Zanussi modified by appending any of the following modifiers to the field name: - =========== ========================================== - .hex display a number as a hex value - .sym display an address as a symbol - .sym-offset display an address as a symbol and offset - .syscall display a syscall id as a system call name - .execname display a common_pid as a program name - .log2 display log2 value rather than raw number - .usecs display a common_timestamp in microseconds - =========== ========================================== + ============= ================================================= + .hex display a number as a hex value + .sym display an address as a symbol + .sym-offset display an address as a symbol and offset + .syscall display a syscall id as a system call name + .execname display a common_pid as a program name + .log2 display log2 value rather than raw number + .buckets=size display grouping of values rather than raw number + .usecs display a common_timestamp in microseconds + ============= ================================================= Note that in general the semantics of a given field aren't interpreted when applying a modifier to it, but there are some @@ -191,7 +192,7 @@ Documentation written by Tom Zanussi with the event, in nanoseconds. May be modified by .usecs to have timestamps interpreted as microseconds. - cpu int the cpu on which the event occurred. + common_cpu int the cpu on which the event occurred. ====================== ==== ======================================= Extended error information @@ -228,7 +229,7 @@ Extended error information that lists the total number of bytes requested for each function in the kernel that made one or more calls to kmalloc:: - # echo 'hist:key=call_site:val=bytes_req' > \ + # echo 'hist:key=call_site:val=bytes_req.buckets=32' > \ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger This tells the tracing system to create a 'hist' trigger using the @@ -1823,20 +1824,99 @@ and variables defined on other events (see Section 2.2.3 below on how that is done using hist trigger 'onmatch' action). Once that is done, the 'wakeup_latency' synthetic event instance is created. -A histogram can now be defined for the new synthetic event:: - - # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - The new event is created under the tracing/events/synthetic/ directory and looks and behaves just like any other event:: # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency enable filter format hist id trigger +A histogram can now be defined for the new synthetic event:: + + # echo 'hist:keys=pid,prio,lat.log2:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + +The above shows the latency "lat" in a power of 2 grouping. + Like any other event, once a histogram is enabled for the event, the output can be displayed by reading the event's 'hist' file. + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.log2:vals=hitcount:sort=lat.log2:size=2048 [active] + # + + { pid: 2035, prio: 9, lat: ~ 2^2 } hitcount: 43 + { pid: 2034, prio: 9, lat: ~ 2^2 } hitcount: 60 + { pid: 2029, prio: 9, lat: ~ 2^2 } hitcount: 965 + { pid: 2034, prio: 120, lat: ~ 2^2 } hitcount: 9 + { pid: 2033, prio: 120, lat: ~ 2^2 } hitcount: 5 + { pid: 2030, prio: 9, lat: ~ 2^2 } hitcount: 335 + { pid: 2030, prio: 120, lat: ~ 2^2 } hitcount: 10 + { pid: 2032, prio: 120, lat: ~ 2^2 } hitcount: 1 + { pid: 2035, prio: 120, lat: ~ 2^2 } hitcount: 2 + { pid: 2031, prio: 9, lat: ~ 2^2 } hitcount: 176 + { pid: 2028, prio: 120, lat: ~ 2^2 } hitcount: 15 + { pid: 2033, prio: 9, lat: ~ 2^2 } hitcount: 91 + { pid: 2032, prio: 9, lat: ~ 2^2 } hitcount: 125 + { pid: 2029, prio: 120, lat: ~ 2^2 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^2 } hitcount: 3 + { pid: 2029, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2035, prio: 9, lat: ~ 2^3 } hitcount: 41 + { pid: 2030, prio: 120, lat: ~ 2^3 } hitcount: 1 + { pid: 2032, prio: 9, lat: ~ 2^3 } hitcount: 32 + { pid: 2031, prio: 9, lat: ~ 2^3 } hitcount: 44 + { pid: 2034, prio: 9, lat: ~ 2^3 } hitcount: 40 + { pid: 2030, prio: 9, lat: ~ 2^3 } hitcount: 29 + { pid: 2033, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2029, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2028, prio: 120, lat: ~ 2^3 } hitcount: 18 + { pid: 2031, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2028, prio: 120, lat: ~ 2^4 } hitcount: 1 + { pid: 2029, prio: 9, lat: ~ 2^4 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^7 } hitcount: 1 + { pid: 2032, prio: 120, lat: ~ 2^7 } hitcount: 1 + + Totals: + Hits: 2122 + Entries: 30 + Dropped: 0 + + +The latency values can also be grouped linearly by a given size with +the ".buckets" modifier and specify a size (in this case groups of 10). + + # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.buckets=10:vals=hitcount:sort=lat.buckets=10:size=2048 [active] + # + + { pid: 2067, prio: 9, lat: ~ 0-9 } hitcount: 220 + { pid: 2068, prio: 9, lat: ~ 0-9 } hitcount: 157 + { pid: 2070, prio: 9, lat: ~ 0-9 } hitcount: 100 + { pid: 2067, prio: 120, lat: ~ 0-9 } hitcount: 6 + { pid: 2065, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2066, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2069, prio: 9, lat: ~ 0-9 } hitcount: 122 + { pid: 2069, prio: 120, lat: ~ 0-9 } hitcount: 8 + { pid: 2070, prio: 120, lat: ~ 0-9 } hitcount: 1 + { pid: 2068, prio: 120, lat: ~ 0-9 } hitcount: 7 + { pid: 2066, prio: 9, lat: ~ 0-9 } hitcount: 365 + { pid: 2064, prio: 120, lat: ~ 0-9 } hitcount: 35 + { pid: 2065, prio: 9, lat: ~ 0-9 } hitcount: 998 + { pid: 2071, prio: 9, lat: ~ 0-9 } hitcount: 85 + { pid: 2065, prio: 9, lat: ~ 10-19 } hitcount: 2 + { pid: 2064, prio: 120, lat: ~ 10-19 } hitcount: 2 + + Totals: + Hits: 2112 + Entries: 16 + Dropped: 0 + 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- diff --git a/Documentation/trace/hwlat_detector.rst b/Documentation/trace/hwlat_detector.rst index 5739349649c8..de94b499b0bc 100644 --- a/Documentation/trace/hwlat_detector.rst +++ b/Documentation/trace/hwlat_detector.rst @@ -76,8 +76,13 @@ in /sys/kernel/tracing: - tracing_cpumask - the CPUs to move the hwlat thread across - hwlat_detector/width - specified amount of time to spin within window (usecs) - hwlat_detector/window - amount of time between (width) runs (usecs) + - hwlat_detector/mode - the thread mode -The hwlat detector's kernel thread will migrate across each CPU specified in -tracing_cpumask between each window. To limit the migration, either modify -tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU -affinity directly, and the migration will stop. +By default, one hwlat detector's kernel thread will migrate across each CPU +specified in cpumask at the beginning of a new window, in a round-robin +fashion. This behavior can be changed by changing the thread mode, +the available options are: + + - none: do not force migration + - round-robin: migrate across each CPU specified in cpumask [default] + - per-cpu: create one thread for each cpu in tracing_cpumask diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index f634b36fd3aa..3769b9b7aed8 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -23,6 +23,8 @@ Linux Tracing Technologies histogram-design boottime-trace hwlat_detector + osnoise-tracer + timerlat-tracer intel_th ring-buffer-design stm diff --git a/Documentation/trace/kprobes.rst b/Documentation/trace/kprobes.rst index b757b6dfd3d4..998149ce2fd9 100644 --- a/Documentation/trace/kprobes.rst +++ b/Documentation/trace/kprobes.rst @@ -362,14 +362,11 @@ register_kprobe #include <linux/kprobes.h> int register_kprobe(struct kprobe *kp); -Sets a breakpoint at the address kp->addr. When the breakpoint is -hit, Kprobes calls kp->pre_handler. After the probed instruction -is single-stepped, Kprobe calls kp->post_handler. If a fault -occurs during execution of kp->pre_handler or kp->post_handler, -or during single-stepping of the probed instruction, Kprobes calls -kp->fault_handler. Any or all handlers can be NULL. If kp->flags -is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, -so, its handlers aren't hit until calling enable_kprobe(kp). +Sets a breakpoint at the address kp->addr. When the breakpoint is hit, Kprobes +calls kp->pre_handler. After the probed instruction is single-stepped, Kprobe +calls kp->post_handler. Any or all handlers can be NULL. If kp->flags is set +KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so, its handlers +aren't hit until calling enable_kprobe(kp). .. note:: @@ -415,17 +412,6 @@ User's post-handler (kp->post_handler):: p and regs are as described for the pre_handler. flags always seems to be zero. -User's fault-handler (kp->fault_handler):: - - #include <linux/kprobes.h> - #include <linux/ptrace.h> - int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr); - -p and regs are as described for the pre_handler. trapnr is the -architecture-specific trap number associated with the fault (e.g., -on i386, 13 for a general protection fault or 14 for a page fault). -Returns 1 if it successfully handled the exception. - register_kretprobe ------------------ diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst new file mode 100644 index 000000000000..b648cb9bf1f0 --- /dev/null +++ b/Documentation/trace/osnoise-tracer.rst @@ -0,0 +1,152 @@ +============== +OSNOISE Tracer +============== + +In the context of high-performance computing (HPC), the Operating System +Noise (*osnoise*) refers to the interference experienced by an application +due to activities inside the operating system. In the context of Linux, +NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the +system. Moreover, hardware-related jobs can also cause noise, for example, +via SMIs. + +hwlat_detector is one of the tools used to identify the most complex +source of noise: *hardware noise*. + +In a nutshell, the hwlat_detector creates a thread that runs +periodically for a given period. At the beginning of a period, the thread +disables interrupt and starts sampling. While running, the hwlatd +thread reads the time in a loop. As interrupts are disabled, threads, +IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the +cause of any gap between two different reads of the time roots either on +NMI or in the hardware itself. At the end of the period, hwlatd enables +interrupts and reports the max observed gap between the reads. It also +prints a NMI occurrence counter. If the output does not report NMI +executions, the user can conclude that the hardware is the culprit for +the latency. The hwlat detects the NMI execution by observing +the entry and exit of a NMI. + +The osnoise tracer leverages the hwlat_detector by running a +similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing +all the sources of *osnoise* during its execution. Using the same approach +of hwlat, osnoise takes note of the entry and exit point of any +source of interferences, increasing a per-cpu interference counter. The +osnoise tracer also saves an interference counter for each source of +interference. The interference counter for NMI, IRQs, SoftIRQs, and +threads is increased anytime the tool observes these interferences' entry +events. When a noise happens without any interference from the operating +system level, the hardware noise counter increases, pointing to a +hardware-related noise. In this way, osnoise can account for any +source of interference. At the end of the period, the osnoise tracer +prints the sum of all noise, the max single noise, the percentage of CPU +available for the thread, and the counters for the noise sources. + +Usage +----- + +Write the ASCII text "osnoise" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo osnoise > current_tracer + +It is possible to follow the trace by reading the trace trace file:: + + [root@f32 tracing]# cat trace + # tracer: osnoise + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth MAX + # || / SINGLE Interference counters: + # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+ + # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD + # | | | |||| | | | | | | | | | | + <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1 + <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3 + <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21 + <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0 + <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41 + <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2 + <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1 + <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19 + +In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the +tracer prints a message at the end of each period for each CPU that is +running an osnoise/ thread. The osnoise specific fields report: + + - The RUNTIME IN US reports the amount of time in microseconds that + the osnoise thread kept looping reading the time. + - The NOISE IN US reports the sum of noise in microseconds observed + by the osnoise tracer during the associated runtime. + - The % OF CPU AVAILABLE reports the percentage of CPU available for + the osnoise thread during the runtime window. + - The MAX SINGLE NOISE IN US reports the maximum single noise observed + during the runtime window. + - The Interference counters display how many each of the respective + interference happened during the runtime window. + +Note that the example above shows a high number of HW noise samples. +The reason being is that this sample was taken on a virtual machine, +and the host interference is detected as a hardware interference. + +Tracer options +--------------------- + +The tracer has a set of options inside the osnoise directory, they are: + + - osnoise/cpus: CPUs at which a osnoise thread will execute. + - osnoise/period_us: the period of the osnoise thread. + - osnoise/runtime_us: how long an osnoise thread will look for noise. + - osnoise/stop_tracing_us: stop the system tracing if a single noise + higher than the configured value happens. Writing 0 disables this + option. + - osnoise/stop_tracing_total_us: stop the system tracing if total noise + higher than the configured value happens. Writing 0 disables this + option. + - tracing_threshold: the minimum delta between two time() reads to be + considered as noise, in us. When set to 0, the default value will + will be used, which is currently 5 us. + +Additional Tracing +------------------ + +In addition to the tracer, a set of tracepoints were added to +facilitate the identification of the osnoise source. + + - osnoise:sample_threshold: printed anytime a noise is higher than + the configurable tolerance_ns. + - osnoise:nmi_noise: noise from NMI, including the duration. + - osnoise:irq_noise: noise from an IRQ, including the duration. + - osnoise:softirq_noise: noise from a SoftIRQ, including the + duration. + - osnoise:thread_noise: noise from a thread, including the duration. + +Note that all the values are *net values*. For example, if while osnoise +is running, another thread preempts the osnoise thread, it will start a +thread_noise duration at the start. Then, an IRQ takes place, preempting +the thread_noise, starting a irq_noise. When the IRQ ends its execution, +it will compute its duration, and this duration will be subtracted from +the thread_noise, in such a way as to avoid the double accounting of the +IRQ execution. This logic is valid for all sources of noise. + +Here is one example of the usage of these tracepoints:: + + osnoise/8-961 [008] d.h. 5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns + osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns + migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns + osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8812 ns interferences 2 + +In this example, a noise sample of 8 microseconds was reported in the last +line, pointing to two interferences. Looking backward in the trace, the +two previous entries were about the migration thread running after a +timer IRQ execution. The first event is not part of the noise because +it took place one millisecond before. + +It is worth noticing that the sum of the duration reported in the +tracepoints is smaller than eight us reported in the sample_threshold. +The reason roots in the overhead of the entry and exit code that happens +before and after any interference execution. This justifies the dual +approach: measuring thread and tracing. diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst new file mode 100644 index 000000000000..c7cbb557aee7 --- /dev/null +++ b/Documentation/trace/timerlat-tracer.rst @@ -0,0 +1,181 @@ +############### +Timerlat tracer +############### + +The timerlat tracer aims to help the preemptive kernel developers to +find souces of wakeup latencies of real-time threads. Like cyclictest, +the tracer sets a periodic timer that wakes up a thread. The thread then +computes a *wakeup latency* value as the difference between the *current +time* and the *absolute time* that the timer was set to expire. The main +goal of timerlat is tracing in such a way to help kernel developers. + +Usage +----- + +Write the ASCII text "timerlat" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + +It is possible to follow the trace by reading the trace trace file:: + + [root@f32 tracing]# cat trace + # tracer: timerlat + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # || / + # |||| ACTIVATION + # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY + # | | | |||| | | | | + <idle>-0 [000] d.h1 54.029328: #1 context irq timer_latency 932 ns + <...>-867 [000] .... 54.029339: #1 context thread timer_latency 11700 ns + <idle>-0 [001] dNh1 54.029346: #1 context irq timer_latency 2833 ns + <...>-868 [001] .... 54.029353: #1 context thread timer_latency 9820 ns + <idle>-0 [000] d.h1 54.030328: #2 context irq timer_latency 769 ns + <...>-867 [000] .... 54.030330: #2 context thread timer_latency 3070 ns + <idle>-0 [001] d.h1 54.030344: #2 context irq timer_latency 935 ns + <...>-868 [001] .... 54.030347: #2 context thread timer_latency 4351 ns + + +The tracer creates a per-cpu kernel thread with real-time priority that +prints two lines at every activation. The first is the *timer latency* +observed at the *hardirq* context before the activation of the thread. +The second is the *timer latency* observed by the thread. The ACTIVATION +ID field serves to relate the *irq* execution to its respective *thread* +execution. + +The *irq*/*thread* splitting is important to clarify at which context +the unexpected high value is coming from. The *irq* context can be +delayed by hardware related actions, such as SMIs, NMIs, IRQs +or by a thread masking interrupts. Once the timer happens, the delay +can also be influenced by blocking caused by threads. For example, by +postponing the scheduler execution via preempt_disable(), by the +scheduler execution, or by masking interrupts. Threads can +also be delayed by the interference from other threads and IRQs. + +Tracer options +--------------------- + +The timerlat tracer is built on top of osnoise tracer. +So its configuration is also done in the osnoise/ config +directory. The timerlat configs are: + + - cpus: CPUs at which a timerlat thread will execute. + - timerlat_period_us: the period of the timerlat thread. + - osnoise/stop_tracing_us: stop the system tracing if a + timer latency at the *irq* context higher than the configured + value happens. Writing 0 disables this option. + - stop_tracing_total_us: stop the system tracing if a + timer latency at the *thread* context higher than the configured + value happens. Writing 0 disables this option. + - print_stack: save the stack of the IRQ ocurrence, and print + it afte the *thread context* event". + +timerlat and osnoise +---------------------------- + +The timerlat can also take advantage of the osnoise: traceevents. +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# echo 1 > events/osnoise/enable + [root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us + [root@f32 tracing]# tail -10 trace + cc1-87882 [005] d..h... 548.771078: #402268 context irq timer_latency 13585 ns + cc1-87882 [005] dNLh1.. 548.771082: irq_noise: local_timer:236 start 548.771077442 duration 7597 ns + cc1-87882 [005] dNLh2.. 548.771099: irq_noise: qxl:21 start 548.771085017 duration 7139 ns + cc1-87882 [005] d...3.. 548.771102: thread_noise: cc1:87882 start 548.771078243 duration 9909 ns + timerlat/5-1035 [005] ....... 548.771104: #402268 context thread timer_latency 39960 ns + +In this case, the root cause of the timer latency does not point to a +single cause, but to multiple ones. Firstly, the timer IRQ was delayed +for 13 us, which may point to a long IRQ disabled section (see IRQ +stacktrace section). Then the timer interrupt that wakes up the timerlat +thread took 7597 ns, and the qxl:21 device IRQ took 7139 ns. Finally, +the cc1 thread noise took 9909 ns of time before the context switch. +Such pieces of evidence are useful for the developer to use other +tracing methods to figure out how to debug and optimize the system. + +It is worth mentioning that the *duration* values reported +by the osnoise: events are *net* values. For example, the +thread_noise does not include the duration of the overhead caused +by the IRQ execution (which indeed accounted for 12736 ns). But +the values reported by the timerlat tracer (timerlat_latency) +are *gross* values. + +The art below illustrates a CPU timeline and how the timerlat tracer +observes it at the top and the osnoise: events at the bottom. Each "-" +in the timelines means circa 1 us, and the time moves ==>:: + + External timer irq thread + clock latency latency + event 13585 ns 39960 ns + | ^ ^ + v | | + |-------------| | + |-------------+-------------------------| + ^ ^ + ======================================================================== + [tmr irq] [dev irq] + [another thread...^ v..^ v.......][timerlat/ thread] <-- CPU timeline + ========================================================================= + |-------| |-------| + |--^ v-------| + | | | + | | + thread_noise: 9909 ns + | +-> irq_noise: 6139 ns + +-> irq_noise: 7597 ns + +IRQ stacktrace +--------------------------- + +The osnoise/print_stack option is helpful for the cases in which a thread +noise causes the major factor for the timer latency, because of preempt or +irq disabled. For example:: + + [root@f32 tracing]# echo 500 > osnoise/stop_tracing_total_us + [root@f32 tracing]# echo 500 > osnoise/print_stack + [root@f32 tracing]# echo timerlat > current_tracer + [root@f32 tracing]# tail -21 per_cpu/cpu7/trace + insmod-1026 [007] dN.h1.. 200.201948: irq_noise: local_timer:236 start 200.201939376 duration 7872 ns + insmod-1026 [007] d..h1.. 200.202587: #29800 context irq timer_latency 1616 ns + insmod-1026 [007] dN.h2.. 200.202598: irq_noise: local_timer:236 start 200.202586162 duration 11855 ns + insmod-1026 [007] dN.h3.. 200.202947: irq_noise: local_timer:236 start 200.202939174 duration 7318 ns + insmod-1026 [007] d...3.. 200.203444: thread_noise: insmod:1026 start 200.202586933 duration 838681 ns + timerlat/7-1001 [007] ....... 200.203445: #29800 context thread timer_latency 859978 ns + timerlat/7-1001 [007] ....1.. 200.203446: <stack trace> + => timerlat_irq + => __hrtimer_run_queues + => hrtimer_interrupt + => __sysvec_apic_timer_interrupt + => asm_call_irq_on_stack + => sysvec_apic_timer_interrupt + => asm_sysvec_apic_timer_interrupt + => delay_tsc + => dummy_load_1ms_pd_init + => do_one_initcall + => do_init_module + => __do_sys_finit_module + => do_syscall_64 + => entry_SYSCALL_64_after_hwframe + +In this case, it is possible to see that the thread added the highest +contribution to the *timer latency* and the stack trace, saved during +the timerlat IRQ handler, points to a function named +dummy_load_1ms_pd_init, which had the following code (on purpose):: + + static int __init dummy_load_1ms_pd_init(void) + { + preempt_disable(); + mdelay(1); + preempt_enable(); + return 0; + + } |