summaryrefslogtreecommitdiff
path: root/Documentation/trace
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/trace')
-rw-r--r--Documentation/trace/boottime-trace.rst91
-rw-r--r--Documentation/trace/coresight/coresight-config.rst244
-rw-r--r--Documentation/trace/coresight/coresight-etm4x-reference.rst2
-rw-r--r--Documentation/trace/coresight/coresight.rst23
-rw-r--r--Documentation/trace/ftrace.rst8
-rw-r--r--Documentation/trace/histogram.rst112
-rw-r--r--Documentation/trace/hwlat_detector.rst13
-rw-r--r--Documentation/trace/index.rst2
-rw-r--r--Documentation/trace/kprobes.rst24
-rw-r--r--Documentation/trace/osnoise-tracer.rst152
-rw-r--r--Documentation/trace/timerlat-tracer.rst181
11 files changed, 800 insertions, 52 deletions
diff --git a/Documentation/trace/boottime-trace.rst b/Documentation/trace/boottime-trace.rst
index 89b64334929b..6dcfbc64014d 100644
--- a/Documentation/trace/boottime-trace.rst
+++ b/Documentation/trace/boottime-trace.rst
@@ -99,6 +99,12 @@ These options are setting per-event options.
ftrace.[instance.INSTANCE.]event.GROUP.EVENT.enable
Enable GROUP:EVENT tracing.
+ftrace.[instance.INSTANCE.]event.GROUP.enable
+ Enable all event tracing within GROUP.
+
+ftrace.[instance.INSTANCE.]event.enable
+ Enable all event tracing.
+
ftrace.[instance.INSTANCE.]event.GROUP.EVENT.filter = FILTER
Set FILTER rule to the GROUP:EVENT.
@@ -119,6 +125,71 @@ Note that kprobe and synthetic event definitions can be written under
instance node, but those are also visible from other instances. So please
take care for event name conflict.
+Ftrace Histogram Options
+------------------------
+
+Since it is too long to write a histogram action as a string for per-event
+action option, there are tree-style options under per-event 'hist' subkey
+for the histogram actions. For the detail of the each parameter,
+please read the event histogram document [3]_.
+
+.. [3] See :ref:`Documentation/trace/histogram.rst <histogram>`
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]keys = KEY1[, KEY2[...]]
+ Set histogram key parameters. (Mandatory)
+ The 'N' is a digit string for the multiple histogram. You can omit it
+ if there is one histogram on the event.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]values = VAL1[, VAL2[...]]
+ Set histogram value parameters.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]sort = SORT1[, SORT2[...]]
+ Set histogram sort parameter options.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]size = NR_ENTRIES
+ Set histogram size (number of entries).
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]name = NAME
+ Set histogram name.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]var.VARIABLE = EXPR
+ Define a new VARIABLE by EXPR expression.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<pause|continue|clear>
+ Set histogram control parameter. You can set one of them.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]event = GROUP.EVENT
+ Set histogram 'onmatch' handler matching event parameter.
+ The 'M' is a digit string for the multiple 'onmatch' handler. You can omit it
+ if there is one 'onmatch' handler on this histogram.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]trace = EVENT[, ARG1[...]]
+ Set histogram 'trace' action for 'onmatch'.
+ EVENT must be a synthetic event name, and ARG1... are parameters
+ for that event. Mandatory if 'onmatch.event' option is set.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmax.[M.]var = VAR
+ Set histogram 'onmax' handler variable parameter.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onchange.[M.]var = VAR
+ Set histogram 'onchange' handler variable parameter.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<onmax|onchange>.[M.]save = ARG1[, ARG2[...]]
+ Set histogram 'save' action parameters for 'onmax' or 'onchange' handler.
+ This option or below 'snapshot' option is mandatory if 'onmax.var' or
+ 'onchange.var' option is set.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]<onmax|onchange>.[M.]snapshot
+ Set histogram 'snapshot' action for 'onmax' or 'onchange' handler.
+ This option or above 'save' option is mandatory if 'onmax.var' or
+ 'onchange.var' option is set.
+
+ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.filter = FILTER_EXPR
+ Set histogram filter expression. You don't need 'if' in the FILTER_EXPR.
+
+Note that this 'hist' option can conflict with the per-event 'actions'
+option if the 'actions' option has a histogram action.
+
When to Start
=============
@@ -153,13 +224,23 @@ below::
}
synthetic.initcall_latency {
fields = "unsigned long func", "u64 lat"
- actions = "hist:keys=func.sym,lat:vals=lat:sort=lat"
+ hist {
+ keys = func.sym, lat
+ values = lat
+ sort = lat
+ }
}
- initcall.initcall_start {
- actions = "hist:keys=func:ts0=common_timestamp.usecs"
+ initcall.initcall_start.hist {
+ keys = func
+ var.ts0 = common_timestamp.usecs
}
- initcall.initcall_finish {
- actions = "hist:keys=func:lat=common_timestamp.usecs-$ts0:onmatch(initcall.initcall_start).initcall_latency(func,$lat)"
+ initcall.initcall_finish.hist {
+ keys = func
+ var.lat = common_timestamp.usecs - $ts0
+ onmatch {
+ event = initcall.initcall_start
+ trace = initcall_latency, func, $lat
+ }
}
}
diff --git a/Documentation/trace/coresight/coresight-config.rst b/Documentation/trace/coresight/coresight-config.rst
new file mode 100644
index 000000000000..a4e3ef295240
--- /dev/null
+++ b/Documentation/trace/coresight/coresight-config.rst
@@ -0,0 +1,244 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+CoreSight System Configuration Manager
+======================================
+
+ :Author: Mike Leach <mike.leach@linaro.org>
+ :Date: October 2020
+
+Introduction
+============
+
+The CoreSight System Configuration manager is an API that allows the
+programming of the CoreSight system with pre-defined configurations that
+can then be easily enabled from sysfs or perf.
+
+Many CoreSight components can be programmed in complex ways - especially ETMs.
+In addition, components can interact across the CoreSight system, often via
+the cross trigger components such as CTI and CTM. These system settings can
+be defined and enabled as named configurations.
+
+
+Basic Concepts
+==============
+
+This section introduces the basic concepts of a CoreSight system configuration.
+
+
+Features
+--------
+
+A feature is a named set of programming for a CoreSight device. The programming
+is device dependent, and can be defined in terms of absolute register values,
+resource usage and parameter values.
+
+The feature is defined using a descriptor. This descriptor is used to load onto
+a matching device, either when the feature is loaded into the system, or when the
+CoreSight device is registered with the configuration manager.
+
+The load process involves interpreting the descriptor into a set of register
+accesses in the driver - the resource usage and parameter descriptions
+translated into appropriate register accesses. This interpretation makes it easy
+and efficient for the feature to be programmed onto the device when required.
+
+The feature will not be active on the device until the feature is enabled, and
+the device itself is enabled. When the device is enabled then enabled features
+will be programmed into the device hardware.
+
+A feature is enabled as part of a configuration being enabled on the system.
+
+
+Parameter Value
+~~~~~~~~~~~~~~~
+
+A parameter value is a named value that may be set by the user prior to the
+feature being enabled that can adjust the behaviour of the operation programmed
+by the feature.
+
+For example, this could be a count value in a programmed operation that repeats
+at a given rate. When the feature is enabled then the current value of the
+parameter is used in programming the device.
+
+The feature descriptor defines a default value for a parameter, which is used
+if the user does not supply a new value.
+
+Users can update parameter values using the configfs API for the CoreSight
+system - which is described below.
+
+The current value of the parameter is loaded into the device when the feature
+is enabled on that device.
+
+
+Configurations
+--------------
+
+A configuration defines a set of features that are to be used in a trace
+session where the configuration is selected. For any trace session only one
+configuration may be selected.
+
+The features defined may be on any type of device that is registered
+to support system configuration. A configuration may select features to be
+enabled on a class of devices - i.e. any ETMv4, or specific devices, e.g. a
+specific CTI on the system.
+
+As with the feature, a descriptor is used to define the configuration.
+This will define the features that must be enabled as part of the configuration
+as well as any preset values that can be used to override default parameter
+values.
+
+
+Preset Values
+~~~~~~~~~~~~~
+
+Preset values are easily selectable sets of parameter values for the features
+that the configuration uses. The number of values in a single preset set, equals
+the sum of parameter values in the features used by the configuration.
+
+e.g. a configuration consists of 3 features, one has 2 parameters, one has
+a single parameter, and another has no parameters. A single preset set will
+therefore have 3 values.
+
+Presets are optionally defined by the configuration, up to 15 can be defined.
+If no preset is selected, then the parameter values defined in the feature
+are used as normal.
+
+
+Operation
+~~~~~~~~~
+
+The following steps take place in the operation of a configuration.
+
+1) In this example, the configuration is 'autofdo', which has an
+ associated feature 'strobing' that works on ETMv4 CoreSight Devices.
+
+2) The configuration is enabled. For example 'perf' may select the
+ configuration as part of its command line::
+
+ perf record -e cs_etm/autofdo/ myapp
+
+ which will enable the 'autofdo' configuration.
+
+3) perf starts tracing on the system. As each ETMv4 that perf uses for
+ trace is enabled, the configuration manager will check if the ETMv4
+ has a feature that relates to the currently active configuration.
+ In this case 'strobing' is enabled & programmed into the ETMv4.
+
+4) When the ETMv4 is disabled, any registers marked as needing to be
+ saved will be read back.
+
+5) At the end of the perf session, the configuration will be disabled.
+
+
+Viewing Configurations and Features
+===================================
+
+The set of configurations and features that are currently loaded into the
+system can be viewed using the configfs API.
+
+Mount configfs as normal and the 'cs-syscfg' subsystem will appear::
+
+ $ ls /config
+ cs-syscfg stp-policy
+
+This has two sub-directories::
+
+ $ cd cs-syscfg/
+ $ ls
+ configurations features
+
+The system has the configuration 'autofdo' built in. It may be examined as
+follows::
+
+ $ cd configurations/
+ $ ls
+ autofdo
+ $ cd autofdo/
+ $ ls
+ description preset1 preset3 preset5 preset7 preset9
+ feature_refs preset2 preset4 preset6 preset8
+ $ cat description
+ Setup ETMs with strobing for autofdo
+ $ cat feature_refs
+ strobing
+
+Each preset declared has a preset<n> subdirectory declared. The values for
+the preset can be examined::
+
+ $ cat preset1/values
+ strobing.window = 0x1388 strobing.period = 0x2
+ $ cat preset2/values
+ strobing.window = 0x1388 strobing.period = 0x4
+
+The features referenced by the configuration can be examined in the features
+directory::
+
+ $ cd ../../features/strobing/
+ $ ls
+ description matches nr_params params
+ $ cat description
+ Generate periodic trace capture windows.
+ parameter 'window': a number of CPU cycles (W)
+ parameter 'period': trace enabled for W cycles every period x W cycles
+ $ cat matches
+ SRC_ETMV4
+ $ cat nr_params
+ 2
+
+Move to the params directory to examine and adjust parameters::
+
+ cd params
+ $ ls
+ period window
+ $ cd period
+ $ ls
+ value
+ $ cat value
+ 0x2710
+ # echo 15000 > value
+ # cat value
+ 0x3a98
+
+Parameters adjusted in this way are reflected in all device instances that have
+loaded the feature.
+
+
+Using Configurations in perf
+============================
+
+The configurations loaded into the CoreSight configuration management are
+also declared in the perf 'cs_etm' event infrastructure so that they can
+be selected when running trace under perf::
+
+ $ ls /sys/devices/cs_etm
+ configurations format perf_event_mux_interval_ms sinks type
+ events nr_addr_filters power
+
+Key directories here are 'configurations' - which lists the loaded
+configurations, and 'events' - a generic perf directory which allows
+selection on the perf command line.::
+
+ $ ls configurations/
+ autofdo
+ $ cat configurations/autofdo
+ 0xa7c3dddd
+
+As with the sinks entries, this provides a hash of the configuration name.
+The entry in the 'events' directory uses perfs built in syntax generator
+to substitute the syntax for the name when evaluating the command::
+
+ $ ls events/
+ autofdo
+ $ cat events/autofdo
+ configid=0xa7c3dddd
+
+The 'autofdo' configuration may be selected on the perf command line::
+
+ $ perf record -e cs_etm/autofdo/u --per-thread <application>
+
+A preset to override the current parameter values can also be selected::
+
+ $ perf record -e cs_etm/autofdo,preset=1/u --per-thread <application>
+
+When configurations are selected in this way, then the trace sink used is
+automatically selected.
diff --git a/Documentation/trace/coresight/coresight-etm4x-reference.rst b/Documentation/trace/coresight/coresight-etm4x-reference.rst
index b64d9a9c79df..d25dfe86af9b 100644
--- a/Documentation/trace/coresight/coresight-etm4x-reference.rst
+++ b/Documentation/trace/coresight/coresight-etm4x-reference.rst
@@ -427,7 +427,7 @@ the ‘TRC’ prefix.
:Syntax:
``echo idx > vmid_idx``
- Where idx <  numvmidc
+ Where idx < numvmidc
----
diff --git a/Documentation/trace/coresight/coresight.rst b/Documentation/trace/coresight/coresight.rst
index 169749efd8d1..a15571d96cc8 100644
--- a/Documentation/trace/coresight/coresight.rst
+++ b/Documentation/trace/coresight/coresight.rst
@@ -315,7 +315,8 @@ intermediate links as required.
Note: ``cti_sys0`` appears in two of the connections lists above.
CTIs can connect to multiple devices and are arranged in a star topology
-via the CTM. See (:doc:`coresight-ect`) [#fourth]_ for further details.
+via the CTM. See (Documentation/trace/coresight/coresight-ect.rst)
+[#fourth]_ for further details.
Looking at this device we see 4 connections::
linaro-developer:~# ls -l /sys/bus/coresight/devices/cti_sys0/connections
@@ -606,7 +607,8 @@ interface provided for that purpose by the generic STM API::
crw------- 1 root root 10, 61 Jan 3 18:11 /dev/stm0
root@genericarmv8:~#
-Details on how to use the generic STM API can be found here:- :doc:`../stm` [#second]_.
+Details on how to use the generic STM API can be found here:
+- Documentation/trace/stm.rst [#second]_.
The CTI & CTM Modules
---------------------
@@ -616,7 +618,20 @@ individual CTIs and components, and can propagate these between all CTIs via
channels on the CTM (Cross Trigger Matrix).
A separate documentation file is provided to explain the use of these devices.
-(:doc:`coresight-ect`) [#fourth]_.
+(Documentation/trace/coresight/coresight-ect.rst) [#fourth]_.
+
+CoreSight System Configuration
+------------------------------
+
+CoreSight components can be complex devices with many programming options.
+Furthermore, components can be programmed to interact with each other across the
+complete system.
+
+A CoreSight System Configuration manager is provided to allow these complex programming
+configurations to be selected and used easily from perf and sysfs.
+
+See the separate document for further information.
+(Documentation/trace/coresight/coresight-config.rst) [#fifth]_.
.. [#first] Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
@@ -626,3 +641,5 @@ A separate documentation file is provided to explain the use of these devices.
.. [#third] https://github.com/Linaro/perf-opencsd
.. [#fourth] Documentation/trace/coresight/coresight-ect.rst
+
+.. [#fifth] Documentation/trace/coresight/coresight-config.rst
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 62c98e9bbdd9..4e5b26f03d5b 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -40,7 +40,7 @@ See events.rst for more information.
Implementation Details
----------------------
-See :doc:`ftrace-design` for details for arch porters and such.
+See Documentation/trace/ftrace-design.rst for details for arch porters and such.
The File System
@@ -354,8 +354,8 @@ of ftrace. Here is a list of some of the key files:
is being directly called by the function. If the count is greater
than 1 it most likely will be ftrace_ops_list_func().
- If the callback of the function jumps to a trampoline that is
- specific to a the callback and not the standard trampoline,
+ If the callback of a function jumps to a trampoline that is
+ specific to the callback and which is not the standard trampoline,
its address will be printed as well as the function that the
trampoline calls.
@@ -2762,7 +2762,7 @@ listed in:
put_prev_task_idle
kmem_cache_create
pick_next_task_rt
- get_online_cpus
+ cpus_read_lock
pick_next_task_fair
mutex_lock
[...]
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst
index b71e09f745c3..533415644c54 100644
--- a/Documentation/trace/histogram.rst
+++ b/Documentation/trace/histogram.rst
@@ -70,15 +70,16 @@ Documentation written by Tom Zanussi
modified by appending any of the following modifiers to the field
name:
- =========== ==========================================
- .hex display a number as a hex value
- .sym display an address as a symbol
- .sym-offset display an address as a symbol and offset
- .syscall display a syscall id as a system call name
- .execname display a common_pid as a program name
- .log2 display log2 value rather than raw number
- .usecs display a common_timestamp in microseconds
- =========== ==========================================
+ ============= =================================================
+ .hex display a number as a hex value
+ .sym display an address as a symbol
+ .sym-offset display an address as a symbol and offset
+ .syscall display a syscall id as a system call name
+ .execname display a common_pid as a program name
+ .log2 display log2 value rather than raw number
+ .buckets=size display grouping of values rather than raw number
+ .usecs display a common_timestamp in microseconds
+ ============= =================================================
Note that in general the semantics of a given field aren't
interpreted when applying a modifier to it, but there are some
@@ -191,7 +192,7 @@ Documentation written by Tom Zanussi
with the event, in nanoseconds. May be
modified by .usecs to have timestamps
interpreted as microseconds.
- cpu int the cpu on which the event occurred.
+ common_cpu int the cpu on which the event occurred.
====================== ==== =======================================
Extended error information
@@ -228,7 +229,7 @@ Extended error information
that lists the total number of bytes requested for each function in
the kernel that made one or more calls to kmalloc::
- # echo 'hist:key=call_site:val=bytes_req' > \
+ # echo 'hist:key=call_site:val=bytes_req.buckets=32' > \
/sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
This tells the tracing system to create a 'hist' trigger using the
@@ -1823,20 +1824,99 @@ and variables defined on other events (see Section 2.2.3 below on
how that is done using hist trigger 'onmatch' action). Once that is
done, the 'wakeup_latency' synthetic event instance is created.
-A histogram can now be defined for the new synthetic event::
-
- # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
- /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
-
The new event is created under the tracing/events/synthetic/ directory
and looks and behaves just like any other event::
# ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
enable filter format hist id trigger
+A histogram can now be defined for the new synthetic event::
+
+ # echo 'hist:keys=pid,prio,lat.log2:sort=lat' >> \
+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
+
+The above shows the latency "lat" in a power of 2 grouping.
+
Like any other event, once a histogram is enabled for the event, the
output can be displayed by reading the event's 'hist' file.
+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
+
+ # event histogram
+ #
+ # trigger info: hist:keys=pid,prio,lat.log2:vals=hitcount:sort=lat.log2:size=2048 [active]
+ #
+
+ { pid: 2035, prio: 9, lat: ~ 2^2 } hitcount: 43
+ { pid: 2034, prio: 9, lat: ~ 2^2 } hitcount: 60
+ { pid: 2029, prio: 9, lat: ~ 2^2 } hitcount: 965
+ { pid: 2034, prio: 120, lat: ~ 2^2 } hitcount: 9
+ { pid: 2033, prio: 120, lat: ~ 2^2 } hitcount: 5
+ { pid: 2030, prio: 9, lat: ~ 2^2 } hitcount: 335
+ { pid: 2030, prio: 120, lat: ~ 2^2 } hitcount: 10
+ { pid: 2032, prio: 120, lat: ~ 2^2 } hitcount: 1
+ { pid: 2035, prio: 120, lat: ~ 2^2 } hitcount: 2
+ { pid: 2031, prio: 9, lat: ~ 2^2 } hitcount: 176
+ { pid: 2028, prio: 120, lat: ~ 2^2 } hitcount: 15
+ { pid: 2033, prio: 9, lat: ~ 2^2 } hitcount: 91
+ { pid: 2032, prio: 9, lat: ~ 2^2 } hitcount: 125
+ { pid: 2029, prio: 120, lat: ~ 2^2 } hitcount: 4
+ { pid: 2031, prio: 120, lat: ~ 2^2 } hitcount: 3
+ { pid: 2029, prio: 120, lat: ~ 2^3 } hitcount: 2
+ { pid: 2035, prio: 9, lat: ~ 2^3 } hitcount: 41
+ { pid: 2030, prio: 120, lat: ~ 2^3 } hitcount: 1
+ { pid: 2032, prio: 9, lat: ~ 2^3 } hitcount: 32
+ { pid: 2031, prio: 9, lat: ~ 2^3 } hitcount: 44
+ { pid: 2034, prio: 9, lat: ~ 2^3 } hitcount: 40
+ { pid: 2030, prio: 9, lat: ~ 2^3 } hitcount: 29
+ { pid: 2033, prio: 9, lat: ~ 2^3 } hitcount: 31
+ { pid: 2029, prio: 9, lat: ~ 2^3 } hitcount: 31
+ { pid: 2028, prio: 120, lat: ~ 2^3 } hitcount: 18
+ { pid: 2031, prio: 120, lat: ~ 2^3 } hitcount: 2
+ { pid: 2028, prio: 120, lat: ~ 2^4 } hitcount: 1
+ { pid: 2029, prio: 9, lat: ~ 2^4 } hitcount: 4
+ { pid: 2031, prio: 120, lat: ~ 2^7 } hitcount: 1
+ { pid: 2032, prio: 120, lat: ~ 2^7 } hitcount: 1
+
+ Totals:
+ Hits: 2122
+ Entries: 30
+ Dropped: 0
+
+
+The latency values can also be grouped linearly by a given size with
+the ".buckets" modifier and specify a size (in this case groups of 10).
+
+ # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \
+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
+
+ # event histogram
+ #
+ # trigger info: hist:keys=pid,prio,lat.buckets=10:vals=hitcount:sort=lat.buckets=10:size=2048 [active]
+ #
+
+ { pid: 2067, prio: 9, lat: ~ 0-9 } hitcount: 220
+ { pid: 2068, prio: 9, lat: ~ 0-9 } hitcount: 157
+ { pid: 2070, prio: 9, lat: ~ 0-9 } hitcount: 100
+ { pid: 2067, prio: 120, lat: ~ 0-9 } hitcount: 6
+ { pid: 2065, prio: 120, lat: ~ 0-9 } hitcount: 2
+ { pid: 2066, prio: 120, lat: ~ 0-9 } hitcount: 2
+ { pid: 2069, prio: 9, lat: ~ 0-9 } hitcount: 122
+ { pid: 2069, prio: 120, lat: ~ 0-9 } hitcount: 8
+ { pid: 2070, prio: 120, lat: ~ 0-9 } hitcount: 1
+ { pid: 2068, prio: 120, lat: ~ 0-9 } hitcount: 7
+ { pid: 2066, prio: 9, lat: ~ 0-9 } hitcount: 365
+ { pid: 2064, prio: 120, lat: ~ 0-9 } hitcount: 35
+ { pid: 2065, prio: 9, lat: ~ 0-9 } hitcount: 998
+ { pid: 2071, prio: 9, lat: ~ 0-9 } hitcount: 85
+ { pid: 2065, prio: 9, lat: ~ 10-19 } hitcount: 2
+ { pid: 2064, prio: 120, lat: ~ 10-19 } hitcount: 2
+
+ Totals:
+ Hits: 2112
+ Entries: 16
+ Dropped: 0
+
2.2.3 Hist trigger 'handlers' and 'actions'
-------------------------------------------
diff --git a/Documentation/trace/hwlat_detector.rst b/Documentation/trace/hwlat_detector.rst
index 5739349649c8..de94b499b0bc 100644
--- a/Documentation/trace/hwlat_detector.rst
+++ b/Documentation/trace/hwlat_detector.rst
@@ -76,8 +76,13 @@ in /sys/kernel/tracing:
- tracing_cpumask - the CPUs to move the hwlat thread across
- hwlat_detector/width - specified amount of time to spin within window (usecs)
- hwlat_detector/window - amount of time between (width) runs (usecs)
+ - hwlat_detector/mode - the thread mode
-The hwlat detector's kernel thread will migrate across each CPU specified in
-tracing_cpumask between each window. To limit the migration, either modify
-tracing_cpumask, or modify the hwlat kernel thread (named [hwlatd]) CPU
-affinity directly, and the migration will stop.
+By default, one hwlat detector's kernel thread will migrate across each CPU
+specified in cpumask at the beginning of a new window, in a round-robin
+fashion. This behavior can be changed by changing the thread mode,
+the available options are:
+
+ - none: do not force migration
+ - round-robin: migrate across each CPU specified in cpumask [default]
+ - per-cpu: create one thread for each cpu in tracing_cpumask
diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index f634b36fd3aa..3769b9b7aed8 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -23,6 +23,8 @@ Linux Tracing Technologies
histogram-design
boottime-trace
hwlat_detector
+ osnoise-tracer
+ timerlat-tracer
intel_th
ring-buffer-design
stm
diff --git a/Documentation/trace/kprobes.rst b/Documentation/trace/kprobes.rst
index b757b6dfd3d4..998149ce2fd9 100644
--- a/Documentation/trace/kprobes.rst
+++ b/Documentation/trace/kprobes.rst
@@ -362,14 +362,11 @@ register_kprobe
#include <linux/kprobes.h>
int register_kprobe(struct kprobe *kp);
-Sets a breakpoint at the address kp->addr. When the breakpoint is
-hit, Kprobes calls kp->pre_handler. After the probed instruction
-is single-stepped, Kprobe calls kp->post_handler. If a fault
-occurs during execution of kp->pre_handler or kp->post_handler,
-or during single-stepping of the probed instruction, Kprobes calls
-kp->fault_handler. Any or all handlers can be NULL. If kp->flags
-is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
-so, its handlers aren't hit until calling enable_kprobe(kp).
+Sets a breakpoint at the address kp->addr. When the breakpoint is hit, Kprobes
+calls kp->pre_handler. After the probed instruction is single-stepped, Kprobe
+calls kp->post_handler. Any or all handlers can be NULL. If kp->flags is set
+KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so, its handlers
+aren't hit until calling enable_kprobe(kp).
.. note::
@@ -415,17 +412,6 @@ User's post-handler (kp->post_handler)::
p and regs are as described for the pre_handler. flags always seems
to be zero.
-User's fault-handler (kp->fault_handler)::
-
- #include <linux/kprobes.h>
- #include <linux/ptrace.h>
- int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr);
-
-p and regs are as described for the pre_handler. trapnr is the
-architecture-specific trap number associated with the fault (e.g.,
-on i386, 13 for a general protection fault or 14 for a page fault).
-Returns 1 if it successfully handled the exception.
-
register_kretprobe
------------------
diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst
new file mode 100644
index 000000000000..b648cb9bf1f0
--- /dev/null
+++ b/Documentation/trace/osnoise-tracer.rst
@@ -0,0 +1,152 @@
+==============
+OSNOISE Tracer
+==============
+
+In the context of high-performance computing (HPC), the Operating System
+Noise (*osnoise*) refers to the interference experienced by an application
+due to activities inside the operating system. In the context of Linux,
+NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the
+system. Moreover, hardware-related jobs can also cause noise, for example,
+via SMIs.
+
+hwlat_detector is one of the tools used to identify the most complex
+source of noise: *hardware noise*.
+
+In a nutshell, the hwlat_detector creates a thread that runs
+periodically for a given period. At the beginning of a period, the thread
+disables interrupt and starts sampling. While running, the hwlatd
+thread reads the time in a loop. As interrupts are disabled, threads,
+IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the
+cause of any gap between two different reads of the time roots either on
+NMI or in the hardware itself. At the end of the period, hwlatd enables
+interrupts and reports the max observed gap between the reads. It also
+prints a NMI occurrence counter. If the output does not report NMI
+executions, the user can conclude that the hardware is the culprit for
+the latency. The hwlat detects the NMI execution by observing
+the entry and exit of a NMI.
+
+The osnoise tracer leverages the hwlat_detector by running a
+similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing
+all the sources of *osnoise* during its execution. Using the same approach
+of hwlat, osnoise takes note of the entry and exit point of any
+source of interferences, increasing a per-cpu interference counter. The
+osnoise tracer also saves an interference counter for each source of
+interference. The interference counter for NMI, IRQs, SoftIRQs, and
+threads is increased anytime the tool observes these interferences' entry
+events. When a noise happens without any interference from the operating
+system level, the hardware noise counter increases, pointing to a
+hardware-related noise. In this way, osnoise can account for any
+source of interference. At the end of the period, the osnoise tracer
+prints the sum of all noise, the max single noise, the percentage of CPU
+available for the thread, and the counters for the noise sources.
+
+Usage
+-----
+
+Write the ASCII text "osnoise" into the current_tracer file of the
+tracing system (generally mounted at /sys/kernel/tracing).
+
+For example::
+
+ [root@f32 ~]# cd /sys/kernel/tracing/
+ [root@f32 tracing]# echo osnoise > current_tracer
+
+It is possible to follow the trace by reading the trace trace file::
+
+ [root@f32 tracing]# cat trace
+ # tracer: osnoise
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth MAX
+ # || / SINGLE Interference counters:
+ # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+
+ # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD
+ # | | | |||| | | | | | | | | | |
+ <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1
+ <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3
+ <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21
+ <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0
+ <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41
+ <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2
+ <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1
+ <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19
+
+In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the
+tracer prints a message at the end of each period for each CPU that is
+running an osnoise/ thread. The osnoise specific fields report:
+
+ - The RUNTIME IN US reports the amount of time in microseconds that
+ the osnoise thread kept looping reading the time.
+ - The NOISE IN US reports the sum of noise in microseconds observed
+ by the osnoise tracer during the associated runtime.
+ - The % OF CPU AVAILABLE reports the percentage of CPU available for
+ the osnoise thread during the runtime window.
+ - The MAX SINGLE NOISE IN US reports the maximum single noise observed
+ during the runtime window.
+ - The Interference counters display how many each of the respective
+ interference happened during the runtime window.
+
+Note that the example above shows a high number of HW noise samples.
+The reason being is that this sample was taken on a virtual machine,
+and the host interference is detected as a hardware interference.
+
+Tracer options
+---------------------
+
+The tracer has a set of options inside the osnoise directory, they are:
+
+ - osnoise/cpus: CPUs at which a osnoise thread will execute.
+ - osnoise/period_us: the period of the osnoise thread.
+ - osnoise/runtime_us: how long an osnoise thread will look for noise.
+ - osnoise/stop_tracing_us: stop the system tracing if a single noise
+ higher than the configured value happens. Writing 0 disables this
+ option.
+ - osnoise/stop_tracing_total_us: stop the system tracing if total noise
+ higher than the configured value happens. Writing 0 disables this
+ option.
+ - tracing_threshold: the minimum delta between two time() reads to be
+ considered as noise, in us. When set to 0, the default value will
+ will be used, which is currently 5 us.
+
+Additional Tracing
+------------------
+
+In addition to the tracer, a set of tracepoints were added to
+facilitate the identification of the osnoise source.
+
+ - osnoise:sample_threshold: printed anytime a noise is higher than
+ the configurable tolerance_ns.
+ - osnoise:nmi_noise: noise from NMI, including the duration.
+ - osnoise:irq_noise: noise from an IRQ, including the duration.
+ - osnoise:softirq_noise: noise from a SoftIRQ, including the
+ duration.
+ - osnoise:thread_noise: noise from a thread, including the duration.
+
+Note that all the values are *net values*. For example, if while osnoise
+is running, another thread preempts the osnoise thread, it will start a
+thread_noise duration at the start. Then, an IRQ takes place, preempting
+the thread_noise, starting a irq_noise. When the IRQ ends its execution,
+it will compute its duration, and this duration will be subtracted from
+the thread_noise, in such a way as to avoid the double accounting of the
+IRQ execution. This logic is valid for all sources of noise.
+
+Here is one example of the usage of these tracepoints::
+
+ osnoise/8-961 [008] d.h. 5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns
+ osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns
+ migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns
+ osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8812 ns interferences 2
+
+In this example, a noise sample of 8 microseconds was reported in the last
+line, pointing to two interferences. Looking backward in the trace, the
+two previous entries were about the migration thread running after a
+timer IRQ execution. The first event is not part of the noise because
+it took place one millisecond before.
+
+It is worth noticing that the sum of the duration reported in the
+tracepoints is smaller than eight us reported in the sample_threshold.
+The reason roots in the overhead of the entry and exit code that happens
+before and after any interference execution. This justifies the dual
+approach: measuring thread and tracing.
diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst
new file mode 100644
index 000000000000..c7cbb557aee7
--- /dev/null
+++ b/Documentation/trace/timerlat-tracer.rst
@@ -0,0 +1,181 @@
+###############
+Timerlat tracer
+###############
+
+The timerlat tracer aims to help the preemptive kernel developers to
+find souces of wakeup latencies of real-time threads. Like cyclictest,
+the tracer sets a periodic timer that wakes up a thread. The thread then
+computes a *wakeup latency* value as the difference between the *current
+time* and the *absolute time* that the timer was set to expire. The main
+goal of timerlat is tracing in such a way to help kernel developers.
+
+Usage
+-----
+
+Write the ASCII text "timerlat" into the current_tracer file of the
+tracing system (generally mounted at /sys/kernel/tracing).
+
+For example::
+
+ [root@f32 ~]# cd /sys/kernel/tracing/
+ [root@f32 tracing]# echo timerlat > current_tracer
+
+It is possible to follow the trace by reading the trace trace file::
+
+ [root@f32 tracing]# cat trace
+ # tracer: timerlat
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # || /
+ # |||| ACTIVATION
+ # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY
+ # | | | |||| | | | |
+ <idle>-0 [000] d.h1 54.029328: #1 context irq timer_latency 932 ns
+ <...>-867 [000] .... 54.029339: #1 context thread timer_latency 11700 ns
+ <idle>-0 [001] dNh1 54.029346: #1 context irq timer_latency 2833 ns
+ <...>-868 [001] .... 54.029353: #1 context thread timer_latency 9820 ns
+ <idle>-0 [000] d.h1 54.030328: #2 context irq timer_latency 769 ns
+ <...>-867 [000] .... 54.030330: #2 context thread timer_latency 3070 ns
+ <idle>-0 [001] d.h1 54.030344: #2 context irq timer_latency 935 ns
+ <...>-868 [001] .... 54.030347: #2 context thread timer_latency 4351 ns
+
+
+The tracer creates a per-cpu kernel thread with real-time priority that
+prints two lines at every activation. The first is the *timer latency*
+observed at the *hardirq* context before the activation of the thread.
+The second is the *timer latency* observed by the thread. The ACTIVATION
+ID field serves to relate the *irq* execution to its respective *thread*
+execution.
+
+The *irq*/*thread* splitting is important to clarify at which context
+the unexpected high value is coming from. The *irq* context can be
+delayed by hardware related actions, such as SMIs, NMIs, IRQs
+or by a thread masking interrupts. Once the timer happens, the delay
+can also be influenced by blocking caused by threads. For example, by
+postponing the scheduler execution via preempt_disable(), by the
+scheduler execution, or by masking interrupts. Threads can
+also be delayed by the interference from other threads and IRQs.
+
+Tracer options
+---------------------
+
+The timerlat tracer is built on top of osnoise tracer.
+So its configuration is also done in the osnoise/ config
+directory. The timerlat configs are:
+
+ - cpus: CPUs at which a timerlat thread will execute.
+ - timerlat_period_us: the period of the timerlat thread.
+ - osnoise/stop_tracing_us: stop the system tracing if a
+ timer latency at the *irq* context higher than the configured
+ value happens. Writing 0 disables this option.
+ - stop_tracing_total_us: stop the system tracing if a
+ timer latency at the *thread* context higher than the configured
+ value happens. Writing 0 disables this option.
+ - print_stack: save the stack of the IRQ ocurrence, and print
+ it afte the *thread context* event".
+
+timerlat and osnoise
+----------------------------
+
+The timerlat can also take advantage of the osnoise: traceevents.
+For example::
+
+ [root@f32 ~]# cd /sys/kernel/tracing/
+ [root@f32 tracing]# echo timerlat > current_tracer
+ [root@f32 tracing]# echo 1 > events/osnoise/enable
+ [root@f32 tracing]# echo 25 > osnoise/stop_tracing_total_us
+ [root@f32 tracing]# tail -10 trace
+ cc1-87882 [005] d..h... 548.771078: #402268 context irq timer_latency 13585 ns
+ cc1-87882 [005] dNLh1.. 548.771082: irq_noise: local_timer:236 start 548.771077442 duration 7597 ns
+ cc1-87882 [005] dNLh2.. 548.771099: irq_noise: qxl:21 start 548.771085017 duration 7139 ns
+ cc1-87882 [005] d...3.. 548.771102: thread_noise: cc1:87882 start 548.771078243 duration 9909 ns
+ timerlat/5-1035 [005] ....... 548.771104: #402268 context thread timer_latency 39960 ns
+
+In this case, the root cause of the timer latency does not point to a
+single cause, but to multiple ones. Firstly, the timer IRQ was delayed
+for 13 us, which may point to a long IRQ disabled section (see IRQ
+stacktrace section). Then the timer interrupt that wakes up the timerlat
+thread took 7597 ns, and the qxl:21 device IRQ took 7139 ns. Finally,
+the cc1 thread noise took 9909 ns of time before the context switch.
+Such pieces of evidence are useful for the developer to use other
+tracing methods to figure out how to debug and optimize the system.
+
+It is worth mentioning that the *duration* values reported
+by the osnoise: events are *net* values. For example, the
+thread_noise does not include the duration of the overhead caused
+by the IRQ execution (which indeed accounted for 12736 ns). But
+the values reported by the timerlat tracer (timerlat_latency)
+are *gross* values.
+
+The art below illustrates a CPU timeline and how the timerlat tracer
+observes it at the top and the osnoise: events at the bottom. Each "-"
+in the timelines means circa 1 us, and the time moves ==>::
+
+ External timer irq thread
+ clock latency latency
+ event 13585 ns 39960 ns
+ | ^ ^
+ v | |
+ |-------------| |
+ |-------------+-------------------------|
+ ^ ^
+ ========================================================================
+ [tmr irq] [dev irq]
+ [another thread...^ v..^ v.......][timerlat/ thread] <-- CPU timeline
+ =========================================================================
+ |-------| |-------|
+ |--^ v-------|
+ | | |
+ | | + thread_noise: 9909 ns
+ | +-> irq_noise: 6139 ns
+ +-> irq_noise: 7597 ns
+
+IRQ stacktrace
+---------------------------
+
+The osnoise/print_stack option is helpful for the cases in which a thread
+noise causes the major factor for the timer latency, because of preempt or
+irq disabled. For example::
+
+ [root@f32 tracing]# echo 500 > osnoise/stop_tracing_total_us
+ [root@f32 tracing]# echo 500 > osnoise/print_stack
+ [root@f32 tracing]# echo timerlat > current_tracer
+ [root@f32 tracing]# tail -21 per_cpu/cpu7/trace
+ insmod-1026 [007] dN.h1.. 200.201948: irq_noise: local_timer:236 start 200.201939376 duration 7872 ns
+ insmod-1026 [007] d..h1.. 200.202587: #29800 context irq timer_latency 1616 ns
+ insmod-1026 [007] dN.h2.. 200.202598: irq_noise: local_timer:236 start 200.202586162 duration 11855 ns
+ insmod-1026 [007] dN.h3.. 200.202947: irq_noise: local_timer:236 start 200.202939174 duration 7318 ns
+ insmod-1026 [007] d...3.. 200.203444: thread_noise: insmod:1026 start 200.202586933 duration 838681 ns
+ timerlat/7-1001 [007] ....... 200.203445: #29800 context thread timer_latency 859978 ns
+ timerlat/7-1001 [007] ....1.. 200.203446: <stack trace>
+ => timerlat_irq
+ => __hrtimer_run_queues
+ => hrtimer_interrupt
+ => __sysvec_apic_timer_interrupt
+ => asm_call_irq_on_stack
+ => sysvec_apic_timer_interrupt
+ => asm_sysvec_apic_timer_interrupt
+ => delay_tsc
+ => dummy_load_1ms_pd_init
+ => do_one_initcall
+ => do_init_module
+ => __do_sys_finit_module
+ => do_syscall_64
+ => entry_SYSCALL_64_after_hwframe
+
+In this case, it is possible to see that the thread added the highest
+contribution to the *timer latency* and the stack trace, saved during
+the timerlat IRQ handler, points to a function named
+dummy_load_1ms_pd_init, which had the following code (on purpose)::
+
+ static int __init dummy_load_1ms_pd_init(void)
+ {
+ preempt_disable();
+ mdelay(1);
+ preempt_enable();
+ return 0;
+
+ }