From 2d4c53973f014983d59a7aa4e980007db315fee0 Mon Sep 17 00:00:00 2001 From: Paran Lee Date: Wed, 15 Mar 2023 14:15:01 +0900 Subject: perf tools riscv: Add support for riscv lookup_binutils_path Add RISC-V binutils path on lookup triplets. Reviewed-by: Palmer Dabbelt Signed-off-by: Paran Lee Acked-by: Ian Rogers Acked-by: Palmer Dabbelt Cc: Albert Ou Cc: Anton Blanchard Cc: Daniel Axtens Cc: Jiri Olsa Cc: Michael Ellerman Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: linux-riscv@lists.infradead.org Link: https://lore.kernel.org/r/20230315051500.13064-1-p4ranlee@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/common.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c index b951374bc49d..4908d54dd33b 100644 --- a/tools/perf/arch/common.c +++ b/tools/perf/arch/common.c @@ -43,6 +43,20 @@ const char *const powerpc_triplets[] = { NULL }; +const char *const riscv32_triplets[] = { + "riscv32-unknown-linux-gnu-", + "riscv32-linux-android-", + "riscv32-linux-gnu-", + NULL +}; + +const char *const riscv64_triplets[] = { + "riscv64-unknown-linux-gnu-", + "riscv64-linux-android-", + "riscv64-linux-gnu-", + NULL +}; + const char *const s390_triplets[] = { "s390-ibm-linux-", "s390x-linux-gnu-", @@ -164,6 +178,10 @@ static int perf_env__lookup_binutils_path(struct perf_env *env, path_list = arm64_triplets; else if (!strcmp(arch, "powerpc")) path_list = powerpc_triplets; + else if (!strcmp(arch, "riscv32")) + path_list = riscv32_triplets; + else if (!strcmp(arch, "riscv64")) + path_list = riscv64_triplets; else if (!strcmp(arch, "sh")) path_list = sh_triplets; else if (!strcmp(arch, "s390")) -- cgit v1.2.3 From 2b433fadb1db6f64a9edf22de668118de7e287ed Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 18 Apr 2023 11:18:24 +0800 Subject: perf map: Add helper map__fprintf_dsoname_dsoff This adds a helper function map__fprintf_dsoname_dsoff() to print dsoname with optional dso offset. Suggested-by: Adrian Hunter Signed-off-by: Changbin Du Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Hui Wang Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230418031825.1262579-3-changbin.du@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 13 +++++++++++++ tools/perf/util/map.h | 1 + 2 files changed, 14 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index b7f890950909..8c96ce6bfc51 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -452,6 +452,19 @@ size_t map__fprintf_dsoname(struct map *map, FILE *fp) return fprintf(fp, "%s", dsoname); } +size_t map__fprintf_dsoname_dsoff(struct map *map, bool print_off, u64 addr, FILE *fp) +{ + int printed = 0; + + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(map, fp); + if (print_off && map && map__dso(map) && !map__dso(map)->kernel) + printed += fprintf(fp, "+0x%" PRIx64, addr); + printed += fprintf(fp, ")"); + + return printed; +} + char *map__srcline(struct map *map, u64 addr, struct symbol *sym) { if (map == NULL) diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 823ab7fc0acf..66a87b3d9965 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -194,6 +194,7 @@ static inline void __map__zput(struct map **map) size_t map__fprintf(struct map *map, FILE *fp); size_t map__fprintf_dsoname(struct map *map, FILE *fp); +size_t map__fprintf_dsoname_dsoff(struct map *map, bool print_off, u64 addr, FILE *fp); char *map__srcline(struct map *map, u64 addr, struct symbol *sym); int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix, FILE *fp); -- cgit v1.2.3 From af9eb56bfed273a85b8c3f99d3ed7ff979c36ae0 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 18 Apr 2023 11:18:25 +0800 Subject: perf script: Add new output field 'dsoff' to print dso offset This adds a new 'dsoff' field to print dso offset for resolved symbols, and the offset is appended to dso name. Default output: $ perf script ls 2695501 3011030.487017: 500000 cycles: 152cc73ef4b5 get_common_indices.constprop.0+0x155 (/usr/lib/x86_64-linux-gnu/ld-2.31.so) ls 2695501 3011030.487018: 500000 cycles: ffffffff99045b3e [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffff9968e107 [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffffc1f54afb [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffff9968382f [unknown] ([unknown]) ls 2695501 3011030.487019: 500000 cycles: ffffffff99e00094 [unknown] ([unknown]) ls 2695501 3011030.487019: 500000 cycles: 152cc718a8d0 __errno_location@plt+0x0 (/usr/lib/x86_64-linux-gnu/libselinux.so.1) Display 'dsoff' field: $ perf script -F +dsoff ls 2695501 3011030.487017: 500000 cycles: 152cc73ef4b5 get_common_indices.constprop.0+0x155 (/usr/lib/x86_64-linux-gnu/ld-2.31.so+0x1c4b5) ls 2695501 3011030.487018: 500000 cycles: ffffffff99045b3e [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffff9968e107 [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffffc1f54afb [unknown] ([unknown]) ls 2695501 3011030.487018: 500000 cycles: ffffffff9968382f [unknown] ([unknown]) ls 2695501 3011030.487019: 500000 cycles: ffffffff99e00094 [unknown] ([unknown]) ls 2695501 3011030.487019: 500000 cycles: 152cc718a8d0 __errno_location@plt+0x0 (/usr/lib/x86_64-linux-gnu/libselinux.so.1+0x68d0) ls 2695501 3011030.487019: 500000 cycles: ffffffff992a6db0 [unknown] ([unknown]) Signed-off-by: Changbin Du Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Hui Wang Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230418031825.1262579-4-changbin.du@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 2 +- tools/perf/builtin-script.c | 60 +++++++++++++------------------- tools/perf/util/evsel_fprintf.c | 16 ++++----- tools/perf/util/evsel_fprintf.h | 1 + 4 files changed, 32 insertions(+), 47 deletions(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 777a0d8ba7d1..ff9a52e44688 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -130,7 +130,7 @@ OPTIONS -F:: --fields:: Comma separated list of fields to print. Options are: - comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, + comm, tid, pid, time, cpu, event, trace, ip, sym, dso, dsoff, addr, symoff, srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackinsnlen, brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, code_page_size, ins_lat, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index c57be48d65bb..029d5a597233 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -133,6 +133,7 @@ enum perf_output_field { PERF_OUTPUT_VCPU = 1ULL << 38, PERF_OUTPUT_CGROUP = 1ULL << 39, PERF_OUTPUT_RETIRE_LAT = 1ULL << 40, + PERF_OUTPUT_DSOFF = 1ULL << 41, }; struct perf_script { @@ -174,6 +175,7 @@ struct output_option { {.str = "ip", .field = PERF_OUTPUT_IP}, {.str = "sym", .field = PERF_OUTPUT_SYM}, {.str = "dso", .field = PERF_OUTPUT_DSO}, + {.str = "dsoff", .field = PERF_OUTPUT_DSOFF}, {.str = "addr", .field = PERF_OUTPUT_ADDR}, {.str = "symoff", .field = PERF_OUTPUT_SYMOFFSET}, {.str = "srcline", .field = PERF_OUTPUT_SRCLINE}, @@ -574,6 +576,9 @@ static void set_print_ip_opts(struct perf_event_attr *attr) if (PRINT_FIELD(DSO)) output[type].print_ip_opts |= EVSEL__PRINT_DSO; + if (PRINT_FIELD(DSOFF)) + output[type].print_ip_opts |= EVSEL__PRINT_DSOFF; + if (PRINT_FIELD(SYMOFFSET)) output[type].print_ip_opts |= EVSEL__PRINT_SYMOFFSET; @@ -627,6 +632,10 @@ static int perf_session__check_output_opt(struct perf_session *session) if (evsel == NULL) continue; + /* 'dsoff' implys 'dso' field */ + if (output[j].fields & PERF_OUTPUT_DSOFF) + output[j].fields |= PERF_OUTPUT_DSO; + set_print_ip_opts(&evsel->core.attr); tod |= output[j].fields & PERF_OUTPUT_TOD; } @@ -929,18 +938,12 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, } printed += fprintf(fp, " 0x%"PRIx64, from); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alf.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alf.map, PRINT_FIELD(DSOFF), alf.addr, fp); printed += fprintf(fp, "/0x%"PRIx64, to); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alt.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); printed += print_bstack_flags(fp, entries + i); } @@ -972,18 +975,12 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, thread__find_symbol_fb(thread, sample->cpumode, to, &alt); printed += symbol__fprintf_symname_offs(alf.sym, &alf, fp); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alf.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alf.map, PRINT_FIELD(DSOFF), alf.addr, fp); printed += fprintf(fp, "%c", '/'); printed += symbol__fprintf_symname_offs(alt.sym, &alt, fp); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alt.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); printed += print_bstack_flags(fp, entries + i); } @@ -1019,17 +1016,11 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, to = map__dso_map_ip(alt.map, to); printed += fprintf(fp, " 0x%"PRIx64, from); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alf.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alf.map, PRINT_FIELD(DSOFF), alf.addr, fp); printed += fprintf(fp, "/0x%"PRIx64, to); - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, "("); - printed += map__fprintf_dsoname(alt.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); printed += print_bstack_flags(fp, entries + i); } @@ -1393,11 +1384,8 @@ static int perf_sample__fprintf_addr(struct perf_sample *sample, printed += symbol__fprintf_symname(al.sym, fp); } - if (PRINT_FIELD(DSO)) { - printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(al.map, fp); - printed += fprintf(fp, ")"); - } + if (PRINT_FIELD(DSO)) + printed += map__fprintf_dsoname_dsoff(al.map, PRINT_FIELD(DSOFF), al.addr, fp); out: return printed; } @@ -3883,7 +3871,7 @@ int cmd_script(int argc, const char **argv) "comma separated output fields prepend with 'type:'. " "+field to add and -field to remove." "Valid types: hw,sw,trace,raw,synth. " - "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," + "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,dsoff" "addr,symoff,srcline,period,iregs,uregs,brstack," "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," "brstackinsnlen,brstackoff,callindent,insn,insnlen,synth," diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index cc80ec554c0a..79e42d66f55b 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -116,6 +116,7 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, int print_ip = print_opts & EVSEL__PRINT_IP; int print_sym = print_opts & EVSEL__PRINT_SYM; int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_dsoff = print_opts & EVSEL__PRINT_DSOFF; int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; int print_oneline = print_opts & EVSEL__PRINT_ONELINE; int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; @@ -171,11 +172,8 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, } } - if (print_dso && (!sym || !sym->inlined)) { - printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(map, fp); - printed += fprintf(fp, ")"); - } + if (print_dso && (!sym || !sym->inlined)) + printed += map__fprintf_dsoname_dsoff(map, print_dsoff, addr, fp); if (print_srcline) printed += map__fprintf_srcline(map, addr, "\n ", fp); @@ -209,6 +207,7 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, int print_ip = print_opts & EVSEL__PRINT_IP; int print_sym = print_opts & EVSEL__PRINT_SYM; int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_dsoff = print_opts & EVSEL__PRINT_DSOFF; int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; @@ -234,11 +233,8 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, } } - if (print_dso) { - printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(al->map, fp); - printed += fprintf(fp, ")"); - } + if (print_dso) + printed += map__fprintf_dsoname_dsoff(al->map, print_dsoff, al->addr, fp); if (print_srcline) printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); diff --git a/tools/perf/util/evsel_fprintf.h b/tools/perf/util/evsel_fprintf.h index 3093d096c29f..c8a9fac2f2dd 100644 --- a/tools/perf/util/evsel_fprintf.h +++ b/tools/perf/util/evsel_fprintf.h @@ -26,6 +26,7 @@ int evsel__fprintf(struct evsel *evsel, struct perf_attr_details *details, FILE #define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) #define EVSEL__PRINT_CALLCHAIN_ARROW (1<<7) #define EVSEL__PRINT_SKIP_IGNORED (1<<8) +#define EVSEL__PRINT_DSOFF (1<<9) struct addr_location; struct perf_event_attr; -- cgit v1.2.3 From 24f0af6d038af461c97433cd80b688eb0346a466 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 24 Apr 2023 08:51:06 +0300 Subject: perf dso: Declare dso const as needed Declare dso const, so that functions can be called with const struct *dso. Signed-off-by: Adrian Hunter Cc: Changbin Du Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20230424055107.12105-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 0b7c7633b9f6..dfc4cf3de7a8 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -379,19 +379,19 @@ void dso__reset_find_symbol_cache(struct dso *dso); size_t dso__fprintf_symbols_by_name(struct dso *dso, FILE *fp); size_t dso__fprintf(struct dso *dso, FILE *fp); -static inline bool dso__is_vmlinux(struct dso *dso) +static inline bool dso__is_vmlinux(const struct dso *dso) { return dso->binary_type == DSO_BINARY_TYPE__VMLINUX || dso->binary_type == DSO_BINARY_TYPE__GUEST_VMLINUX; } -static inline bool dso__is_kcore(struct dso *dso) +static inline bool dso__is_kcore(const struct dso *dso) { return dso->binary_type == DSO_BINARY_TYPE__KCORE || dso->binary_type == DSO_BINARY_TYPE__GUEST_KCORE; } -static inline bool dso__is_kallsyms(struct dso *dso) +static inline bool dso__is_kallsyms(const struct dso *dso) { return dso->kernel && dso->long_name[0] != '/'; } -- cgit v1.2.3 From d3b52f71d18513c209465ba65e0c524b9733351b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 24 Apr 2023 08:51:07 +0300 Subject: perf script: Refine printing of dso offset (dsoff) Print dso offset only for object files, and in those cases force using the dso->long_name if the dso->name starts with '[' or the dso is kcore, in order to avoid special names such as [vdso], or mixing up kcore with vmlinux. Signed-off-by: Adrian Hunter Cc: Changbin Du Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20230424055107.12105-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/dso.h | 2 ++ tools/perf/util/map.c | 23 +++++++++++++++++++---- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index a86614599269..046fbfcfdaab 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -67,6 +67,39 @@ char dso__symtab_origin(const struct dso *dso) return origin[dso->symtab_type]; } +bool dso__is_object_file(const struct dso *dso) +{ + switch (dso->binary_type) { + case DSO_BINARY_TYPE__KALLSYMS: + case DSO_BINARY_TYPE__GUEST_KALLSYMS: + case DSO_BINARY_TYPE__JAVA_JIT: + case DSO_BINARY_TYPE__BPF_PROG_INFO: + case DSO_BINARY_TYPE__BPF_IMAGE: + case DSO_BINARY_TYPE__OOL: + return false; + case DSO_BINARY_TYPE__VMLINUX: + case DSO_BINARY_TYPE__GUEST_VMLINUX: + case DSO_BINARY_TYPE__DEBUGLINK: + case DSO_BINARY_TYPE__BUILD_ID_CACHE: + case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO: + case DSO_BINARY_TYPE__FEDORA_DEBUGINFO: + case DSO_BINARY_TYPE__UBUNTU_DEBUGINFO: + case DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO: + case DSO_BINARY_TYPE__BUILDID_DEBUGINFO: + case DSO_BINARY_TYPE__SYSTEM_PATH_DSO: + case DSO_BINARY_TYPE__GUEST_KMODULE: + case DSO_BINARY_TYPE__GUEST_KMODULE_COMP: + case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE: + case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP: + case DSO_BINARY_TYPE__KCORE: + case DSO_BINARY_TYPE__GUEST_KCORE: + case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO: + case DSO_BINARY_TYPE__NOT_FOUND: + default: + return true; + } +} + int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type, char *root_dir, char *filename, size_t size) diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index dfc4cf3de7a8..b23a157c914d 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -396,6 +396,8 @@ static inline bool dso__is_kallsyms(const struct dso *dso) return dso->kernel && dso->long_name[0] != '/'; } +bool dso__is_object_file(const struct dso *dso); + void dso__free_a2l(struct dso *dso); enum dso_type dso__type(struct dso *dso, struct machine *machine); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 8c96ce6bfc51..4d9944bbf5e4 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -431,14 +431,21 @@ size_t map__fprintf(struct map *map, FILE *fp) map__start(map), map__end(map), map__pgoff(map), dso->name); } -size_t map__fprintf_dsoname(struct map *map, FILE *fp) +static bool prefer_dso_long_name(const struct dso *dso, bool print_off) +{ + return dso->long_name && + (symbol_conf.show_kernel_path || + (print_off && (dso->name[0] == '[' || dso__is_kcore(dso)))); +} + +static size_t __map__fprintf_dsoname(struct map *map, bool print_off, FILE *fp) { char buf[symbol_conf.pad_output_len_dso + 1]; const char *dsoname = "[unknown]"; const struct dso *dso = map ? map__dso(map) : NULL; if (dso) { - if (symbol_conf.show_kernel_path && dso->long_name) + if (prefer_dso_long_name(dso, print_off)) dsoname = dso->long_name; else dsoname = dso->name; @@ -452,13 +459,21 @@ size_t map__fprintf_dsoname(struct map *map, FILE *fp) return fprintf(fp, "%s", dsoname); } +size_t map__fprintf_dsoname(struct map *map, FILE *fp) +{ + return __map__fprintf_dsoname(map, false, fp); +} + size_t map__fprintf_dsoname_dsoff(struct map *map, bool print_off, u64 addr, FILE *fp) { + const struct dso *dso = map ? map__dso(map) : NULL; int printed = 0; + if (print_off && (!dso || !dso__is_object_file(dso))) + print_off = false; printed += fprintf(fp, " ("); - printed += map__fprintf_dsoname(map, fp); - if (print_off && map && map__dso(map) && !map__dso(map)->kernel) + printed += __map__fprintf_dsoname(map, print_off, fp); + if (print_off) printed += fprintf(fp, "+0x%" PRIx64, addr); printed += fprintf(fp, ")"); -- cgit v1.2.3 From 2a61d97fb0ff17411aeb808c145209dbcfc34506 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:12 -0700 Subject: perf vendor events intel: Add alderlake metric constraints Previously these constraints were disabled as they contained topdown events. Since: https://lore.kernel.org/all/20230312021543.3060328-9-irogers@google.com/ the topdown events are correctly grouped even if no group exists. This change was created by PR: https://github.com/intel/perfmon/pull/71 Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 1f9047553942..4c2a14ea5a1c 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -1077,6 +1077,7 @@ }, { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", @@ -1203,6 +1204,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_branch_misprediction_cost", @@ -1255,6 +1257,7 @@ }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", "MetricName": "tma_info_core_bound_likely", @@ -1315,6 +1318,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", "MetricName": "tma_info_dsb_misses", @@ -1408,6 +1412,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_instruction_fetch_bw", @@ -1827,6 +1832,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_memory_data_tlbs", @@ -1836,6 +1842,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", "MetricName": "tma_info_memory_latency", @@ -1845,6 +1852,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_mispredictions", @@ -1877,6 +1885,7 @@ }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_retiring * tma_info_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_retire", @@ -2152,6 +2161,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", @@ -2231,6 +2241,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", -- cgit v1.2.3 From aea8abd7d435262f1db443b163147545098d517f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:13 -0700 Subject: perf vendor events intel: Add icelake metric constraints Previously these constraints were disabled as they contained topdown events. Since: https://lore.kernel.org/all/20230312021543.3060328-9-irogers@google.com/ the topdown events are correctly grouped even if no group exists. This change was created by PR: https://github.com/intel/perfmon/pull/71 Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index 1a2154f28b7b..ae8a96ec7fa5 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -317,6 +317,7 @@ }, { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", @@ -421,6 +422,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_branch_misprediction_cost", @@ -466,6 +468,7 @@ }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", "MetricName": "tma_info_core_bound_likely", @@ -518,6 +521,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", "MetricName": "tma_info_dsb_misses", @@ -599,6 +603,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_instruction_fetch_bw", @@ -937,6 +942,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_memory_data_tlbs", @@ -945,6 +951,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", "MetricName": "tma_info_memory_latency", @@ -953,6 +960,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_mispredictions", @@ -1004,6 +1012,7 @@ }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_retire" @@ -1207,6 +1216,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", @@ -1277,6 +1287,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", -- cgit v1.2.3 From f215040aa24534aa8d4c4bf657387f7252e64370 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:14 -0700 Subject: perf vendor events intel: Add icelakex metric constraints Previously these constraints were disabled as they contained topdown events. Since: https://lore.kernel.org/all/20230312021543.3060328-9-irogers@google.com/ the topdown events are correctly grouped even if no group exists. This change was created by PR: https://github.com/intel/perfmon/pull/71 Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index 1ef772b40e04..b736fec164d0 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -282,6 +282,7 @@ }, { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", @@ -386,6 +387,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_branch_misprediction_cost", @@ -431,6 +433,7 @@ }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", "MetricName": "tma_info_core_bound_likely", @@ -483,6 +486,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", "MetricName": "tma_info_dsb_misses", @@ -564,6 +568,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_instruction_fetch_bw", @@ -948,6 +953,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_memory_data_tlbs", @@ -956,6 +962,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", "MetricName": "tma_info_memory_latency", @@ -964,6 +971,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_mispredictions", @@ -1027,6 +1035,7 @@ }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_retire" @@ -1230,6 +1239,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", @@ -1300,6 +1310,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", -- cgit v1.2.3 From cbd393afa3ff6ef951e40c3bfbe7538aaf72c9aa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:15 -0700 Subject: perf vendor events intel: Add sapphirerapids metric constraints Previously these constraints were disabled as they contained topdown events. Since: https://lore.kernel.org/all/20230312021543.3060328-9-irogers@google.com/ the topdown events are correctly grouped even if no group exists. This change was created by PR: https://github.com/intel/perfmon/pull/71 Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 620fc5bd2217..4308e2483112 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -290,6 +290,7 @@ }, { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector + tma_fp_amx", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", @@ -412,6 +413,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_branch_misprediction_cost", @@ -457,6 +459,7 @@ }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", "MetricName": "tma_info_core_bound_likely", @@ -509,6 +512,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", "MetricName": "tma_info_dsb_misses", @@ -590,6 +594,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_instruction_fetch_bw", @@ -998,6 +1003,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_memory_data_tlbs", @@ -1006,6 +1012,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", "MetricName": "tma_info_memory_latency", @@ -1014,6 +1021,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_mispredictions", @@ -1054,6 +1062,7 @@ }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_retire" @@ -1328,6 +1337,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", @@ -1399,6 +1409,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", -- cgit v1.2.3 From cde61c605252d3ee2eba3cc966f4871819108955 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:16 -0700 Subject: perf vendor events intel: Add tigerlake metric constraints Previously these constraints were disabled as they contained topdown events. Since: https://lore.kernel.org/all/20230312021543.3060328-9-irogers@google.com/ the topdown events are correctly grouped even if no group exists. This change was created by PR: https://github.com/intel/perfmon/pull/71 Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index b442ed4acfbb..ae62bacf9f5e 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -311,6 +311,7 @@ }, { "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector", "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fp_arith", @@ -415,6 +416,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_branch_misprediction_cost", @@ -460,6 +462,7 @@ }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", "MetricName": "tma_info_core_bound_likely", @@ -512,6 +515,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", "MetricName": "tma_info_dsb_misses", @@ -593,6 +597,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", "MetricGroup": "Fed;FetchBW;Frontend", "MetricName": "tma_info_instruction_fetch_bw", @@ -957,6 +962,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_info_memory_data_tlbs", @@ -965,6 +971,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", "MetricName": "tma_info_memory_latency", @@ -973,6 +980,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", "MetricName": "tma_info_mispredictions", @@ -1024,6 +1032,7 @@ }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", "MetricName": "tma_info_retire" @@ -1221,6 +1230,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_light_operations * MEM_INST_RETIRED.ANY / INST_RETIRED.ANY", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", @@ -1291,6 +1301,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", -- cgit v1.2.3 From 5a52817e388bc2beaceff7b2059988987491cb59 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:17 -0700 Subject: perf test: Test more sysfs events Parse events for all PMUs, and not just cpu, in test "Parsing of all PMU events from sysfs". Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 129 ++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 58 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 8068cfd89b84..3721a2182f45 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -7,6 +7,7 @@ #include "debug.h" #include "pmu.h" #include "pmu-hybrid.h" +#include "pmus.h" #include #include #include "fncache.h" @@ -558,7 +559,8 @@ static int test__checkevent_pmu_events(struct evlist *evlist) struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type || + strcmp(evsel->pmu_name, "cpu")); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", @@ -590,7 +592,8 @@ static int test__checkevent_pmu_events_mix(struct evlist *evlist) /* cpu/pmu-event/u*/ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type || + strcmp(evsel->pmu_name, "cpu")); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", @@ -2225,74 +2228,84 @@ static int test_pmu(void) static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct stat st; - char path[PATH_MAX]; - struct dirent *ent; - DIR *dir; - int ret; - - if (!test_pmu()) - return TEST_SKIP; + struct perf_pmu *pmu; + int ret = TEST_OK; - snprintf(path, PATH_MAX, "%s/bus/event_source/devices/cpu/events/", - sysfs__mountpoint()); + if (list_empty(&pmus)) + perf_pmu__scan(NULL); - ret = stat(path, &st); - if (ret) { - pr_debug("omitting PMU cpu events tests: %s\n", path); - return TEST_OK; - } + perf_pmus__for_each_pmu(pmu) { + struct stat st; + char path[PATH_MAX]; + struct dirent *ent; + DIR *dir; + int err; - dir = opendir(path); - if (!dir) { - pr_debug("can't open pmu event dir: %s\n", path); - return TEST_FAIL; - } + snprintf(path, PATH_MAX, "%s/bus/event_source/devices/%s/events/", + sysfs__mountpoint(), pmu->name); - ret = TEST_OK; - while ((ent = readdir(dir))) { - struct evlist_test e = { .name = NULL, }; - char name[2 * NAME_MAX + 1 + 12 + 3]; - int test_ret; + err = stat(path, &st); + if (err) { + pr_debug("skipping PMU %s events tests: %s\n", pmu->name, path); + continue; + } - /* Names containing . are special and cannot be used directly */ - if (strchr(ent->d_name, '.')) + dir = opendir(path); + if (!dir) { + pr_debug("can't open pmu event dir: %s\n", path); + ret = combine_test_results(ret, TEST_SKIP); continue; + } - snprintf(name, sizeof(name), "cpu/event=%s/u", ent->d_name); + while ((ent = readdir(dir))) { + struct evlist_test e = { .name = NULL, }; + char name[2 * NAME_MAX + 1 + 12 + 3]; + int test_ret; - e.name = name; - e.check = test__checkevent_pmu_events; + /* Names containing . are special and cannot be used directly */ + if (strchr(ent->d_name, '.')) + continue; - test_ret = test_event(&e); - if (test_ret != TEST_OK) { - pr_debug("Test PMU event failed for '%s'", name); - ret = combine_test_results(ret, test_ret); - } - /* - * Names containing '-' are recognized as prefixes and suffixes - * due to '-' being a legacy PMU separator. This fails when the - * prefix or suffix collides with an existing legacy token. For - * example, branch-brs has a prefix (branch) that collides with - * a PE_NAME_CACHE_TYPE token causing a parse error as a suffix - * isn't expected after this. As event names in the config - * slashes are allowed a '-' in the name we check this works - * above. - */ - if (strchr(ent->d_name, '-')) - continue; + snprintf(name, sizeof(name), "%s/event=%s/u", pmu->name, ent->d_name); - snprintf(name, sizeof(name), "%s:u,cpu/event=%s/u", ent->d_name, ent->d_name); - e.name = name; - e.check = test__checkevent_pmu_events_mix; - test_ret = test_event(&e); - if (test_ret != TEST_OK) { - pr_debug("Test PMU event failed for '%s'", name); - ret = combine_test_results(ret, test_ret); + e.name = name; + e.check = test__checkevent_pmu_events; + + test_ret = test_event(&e); + if (test_ret != TEST_OK) { + pr_debug("Test PMU event failed for '%s'", name); + ret = combine_test_results(ret, test_ret); + } + + if (!is_pmu_core(pmu->name)) + continue; + + /* + * Names containing '-' are recognized as prefixes and suffixes + * due to '-' being a legacy PMU separator. This fails when the + * prefix or suffix collides with an existing legacy token. For + * example, branch-brs has a prefix (branch) that collides with + * a PE_NAME_CACHE_TYPE token causing a parse error as a suffix + * isn't expected after this. As event names in the config + * slashes are allowed a '-' in the name we check this works + * above. + */ + if (strchr(ent->d_name, '-')) + continue; + + snprintf(name, sizeof(name), "%s:u,%s/event=%s/u", + ent->d_name, pmu->name, ent->d_name); + e.name = name; + e.check = test__checkevent_pmu_events_mix; + test_ret = test_event(&e); + if (test_ret != TEST_OK) { + pr_debug("Test PMU event failed for '%s'", name); + ret = combine_test_results(ret, test_ret); + } } - } - closedir(dir); + closedir(dir); + } return ret; } -- cgit v1.2.3 From 8f8c106886983f7963df76d33bf9e42df8ec3e8a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:18 -0700 Subject: perf test: Use valid for PMU tests Rather than skip all tests in test__events_pmu if PMU cpu isn't present, use the per-test valid test. This allows the running of software PMU tests on hybrid and arm systems. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 3721a2182f45..c06fa7653ac2 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1432,6 +1432,11 @@ static int test__checkevent_config_cache(struct evlist *evlist) return TEST_OK; } +static bool test__pmu_cpu_valid(void) +{ + return !!perf_pmu__find("cpu"); +} + static bool test__intel_pt_valid(void) { return !!perf_pmu__find("intel_pt"); @@ -1981,21 +1986,25 @@ static const struct evlist_test test__events[] = { static const struct evlist_test test__events_pmu[] = { { .name = "cpu/config=10,config1,config2=3,period=1000/u", + .valid = test__pmu_cpu_valid, .check = test__checkevent_pmu, /* 0 */ }, { .name = "cpu/config=1,name=krava/u,cpu/config=2/u", + .valid = test__pmu_cpu_valid, .check = test__checkevent_pmu_name, /* 1 */ }, { .name = "cpu/config=1,call-graph=fp,time,period=100000/,cpu/config=2,call-graph=no,time=0,period=2000/", + .valid = test__pmu_cpu_valid, .check = test__checkevent_pmu_partial_time_callgraph, /* 2 */ }, { .name = "cpu/name='COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks',period=0x1,event=0x2/ukp", + .valid = test__pmu_cpu_valid, .check = test__checkevent_complex_name, /* 3 */ }, @@ -2211,21 +2220,6 @@ static int test__terms2(struct test_suite *test __maybe_unused, int subtest __ma return test_terms(test__terms, ARRAY_SIZE(test__terms)); } -static int test_pmu(void) -{ - struct stat st; - char path[PATH_MAX]; - int ret; - - snprintf(path, PATH_MAX, "%s/bus/event_source/devices/cpu/format/", - sysfs__mountpoint()); - - ret = stat(path, &st); - if (ret) - pr_debug("omitting PMU cpu tests\n"); - return !ret; -} - static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu *pmu; @@ -2311,9 +2305,6 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest static int test__pmu_events2(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - if (!test_pmu()) - return TEST_SKIP; - return test_events(test__events_pmu, ARRAY_SIZE(test__events_pmu)); } -- cgit v1.2.3 From 9854934b055cfc37b3acdff5323dd2060745a774 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:19 -0700 Subject: perf test: Mask configs with extended types then test Add helper to test the config of an evsel. Dependent on the type of the evsel, mask the config so that high-bits containing the extended PMU type are ignored. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 197 ++++++++++++++++++---------------------- 1 file changed, 88 insertions(+), 109 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index c06fa7653ac2..4b00cb4aa73a 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -21,6 +21,21 @@ #define PERF_TP_SAMPLE_TYPE (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | \ PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD) +static bool test_config(const struct evsel *evsel, __u64 expected_config) +{ + __u32 type = evsel->core.attr.type; + __u64 config = evsel->core.attr.config; + + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + /* + * HARDWARE and HW_CACHE events encode the PMU's extended type + * in the top 32-bits. Mask in order to ignore. + */ + config &= PERF_HW_EVENT_MASK; + } + return config == expected_config; +} + #ifdef HAVE_LIBTRACEEVENT #if defined(__s390x__) @@ -87,7 +102,7 @@ static int test__checkevent_raw(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); return TEST_OK; } @@ -97,7 +112,7 @@ static int test__checkevent_numeric(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); return TEST_OK; } @@ -107,8 +122,7 @@ static int test__checkevent_symbolic_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); return TEST_OK; } @@ -118,8 +132,7 @@ static int test__checkevent_symbolic_name_config(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); /* * The period value gets configured within evlist__config, * while this test executes only parse events method. @@ -139,8 +152,7 @@ static int test__checkevent_symbolic_alias(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_SW_PAGE_FAULTS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_SW_PAGE_FAULTS)); return TEST_OK; } @@ -150,7 +162,7 @@ static int test__checkevent_genhw(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", (1 << 16) == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1 << 16)); return TEST_OK; } @@ -160,7 +172,7 @@ static int test__checkevent_breakpoint(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", (HW_BREAKPOINT_R | HW_BREAKPOINT_W) == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", HW_BREAKPOINT_LEN_4 == @@ -174,7 +186,7 @@ static int test__checkevent_breakpoint_x(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_X == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", sizeof(long) == evsel->core.attr.bp_len); @@ -188,7 +200,7 @@ static int test__checkevent_breakpoint_r(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_R == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", @@ -203,7 +215,7 @@ static int test__checkevent_breakpoint_w(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_W == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", @@ -218,7 +230,7 @@ static int test__checkevent_breakpoint_rw(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", (HW_BREAKPOINT_R|HW_BREAKPOINT_W) == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", @@ -447,7 +459,7 @@ static int test__checkevent_pmu(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 10 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 10)); TEST_ASSERT_VAL("wrong config1", 1 == evsel->core.attr.config1); TEST_ASSERT_VAL("wrong config2", 3 == evsel->core.attr.config2); TEST_ASSERT_VAL("wrong config3", 0 == evsel->core.attr.config3); @@ -469,7 +481,7 @@ static int test__checkevent_list(struct evlist *evlist) /* r1 */ TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); TEST_ASSERT_VAL("wrong config1", 0 == evsel->core.attr.config1); TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2); TEST_ASSERT_VAL("wrong config3", 0 == evsel->core.attr.config3); @@ -492,7 +504,7 @@ static int test__checkevent_list(struct evlist *evlist) /* 1:1:hp */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -509,14 +521,14 @@ static int test__checkevent_pmu_name(struct evlist *evlist) /* cpu/config=1,name=krava/u */ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "krava")); /* cpu/config=2/u" */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 2 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 2)); TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "cpu/config=2/u")); @@ -530,7 +542,7 @@ static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist) /* cpu/config=1,call-graph=fp,time,period=100000/ */ TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 1 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); /* * The period, time and callgraph value gets configured within evlist__config, * while this test executes only parse events method. @@ -542,7 +554,7 @@ static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist) /* cpu/config=2,call-graph=no,time=0,period=2000/ */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 2 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 2)); /* * The period, time and callgraph value gets configured within evlist__config, * while this test executes only parse events method. @@ -696,8 +708,7 @@ static int test__group1(struct evlist *evlist) /* instructions:k */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -712,8 +723,7 @@ static int test__group1(struct evlist *evlist) /* cycles:upp */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -738,8 +748,7 @@ static int test__group2(struct evlist *evlist) /* faults + :ku modifier */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_SW_PAGE_FAULTS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_SW_PAGE_FAULTS)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -754,8 +763,7 @@ static int test__group2(struct evlist *evlist) /* cache-references + :u modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_REFERENCES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_REFERENCES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -769,8 +777,7 @@ static int test__group2(struct evlist *evlist) /* cycles:k */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -813,8 +820,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) /* group1 cycles:kppp */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -830,8 +836,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) /* group2 cycles + G modifier */ evsel = leader = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -848,7 +853,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) /* group2 1:3 + G modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 3 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 3)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -862,8 +867,7 @@ static int test__group3(struct evlist *evlist __maybe_unused) /* instructions:u */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -887,8 +891,7 @@ static int test__group4(struct evlist *evlist __maybe_unused) /* cycles:u + p */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -905,8 +908,7 @@ static int test__group4(struct evlist *evlist __maybe_unused) /* instructions:kp + p */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -931,8 +933,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* cycles + G */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -948,8 +949,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* instructions + G */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -963,8 +963,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* cycles:G */ evsel = leader = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -980,8 +979,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* instructions:G */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -994,8 +992,7 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* cycles */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1017,8 +1014,7 @@ static int test__group_gh1(struct evlist *evlist) /* cycles + :H group modifier */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1033,8 +1029,7 @@ static int test__group_gh1(struct evlist *evlist) /* cache-misses:G + :H group modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1057,8 +1052,7 @@ static int test__group_gh2(struct evlist *evlist) /* cycles + :G group modifier */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1073,8 +1067,7 @@ static int test__group_gh2(struct evlist *evlist) /* cache-misses:H + :G group modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1097,8 +1090,7 @@ static int test__group_gh3(struct evlist *evlist) /* cycles:G + :u group modifier */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1113,8 +1105,7 @@ static int test__group_gh3(struct evlist *evlist) /* cache-misses:H + :u group modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1137,8 +1128,7 @@ static int test__group_gh4(struct evlist *evlist) /* cycles:G + :uG group modifier */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1153,8 +1143,7 @@ static int test__group_gh4(struct evlist *evlist) /* cache-misses:H + :uG group modifier */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1176,8 +1165,7 @@ static int test__leader_sample1(struct evlist *evlist) /* cycles - sampling group leader */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1191,8 +1179,7 @@ static int test__leader_sample1(struct evlist *evlist) /* cache-misses - not sampling */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1205,8 +1192,7 @@ static int test__leader_sample1(struct evlist *evlist) /* branch-misses - not sampling */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1229,8 +1215,7 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) /* instructions - sampling group leader */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1244,8 +1229,7 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) /* branch-misses - not sampling */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1281,8 +1265,7 @@ static int test__pinned_group(struct evlist *evlist) /* cycles - group leader */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); @@ -1290,14 +1273,12 @@ static int test__pinned_group(struct evlist *evlist) /* cache-misses - can not be pinned, but will go on with the leader */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); /* branch-misses - ditto */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); return TEST_OK; @@ -1325,8 +1306,7 @@ static int test__exclusive_group(struct evlist *evlist) /* cycles - group leader */ evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclusive", evsel->core.attr.exclusive); @@ -1334,14 +1314,12 @@ static int test__exclusive_group(struct evlist *evlist) /* cache-misses - can not be pinned, but will go on with the leader */ evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_CACHE_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); /* branch-misses - ditto */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_HW_BRANCH_MISSES == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); return TEST_OK; @@ -1352,7 +1330,7 @@ static int test__checkevent_breakpoint_len(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", (HW_BREAKPOINT_R | HW_BREAKPOINT_W) == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", HW_BREAKPOINT_LEN_1 == @@ -1367,7 +1345,7 @@ static int test__checkevent_breakpoint_len_w(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_W == evsel->core.attr.bp_type); TEST_ASSERT_VAL("wrong bp_len", HW_BREAKPOINT_LEN_2 == @@ -1395,8 +1373,7 @@ static int test__checkevent_precise_max_modifier(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", - PERF_COUNT_SW_TASK_CLOCK == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_SW_TASK_CLOCK)); return TEST_OK; } @@ -1454,7 +1431,9 @@ static int test__checkevent_complex_name(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); - TEST_ASSERT_VAL("wrong complex name parsing", evsel__name_is(evsel, "COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks")); + TEST_ASSERT_VAL("wrong complex name parsing", + evsel__name_is(evsel, + "COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks")); return TEST_OK; } @@ -1464,7 +1443,7 @@ static int test__checkevent_raw_pmu(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); return TEST_OK; } @@ -1473,7 +1452,7 @@ static int test__sym_event_slash(struct evlist *evlist) struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); return TEST_OK; } @@ -1483,7 +1462,7 @@ static int test__sym_event_dc(struct evlist *evlist) struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); return TEST_OK; } @@ -1550,7 +1529,7 @@ static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); return TEST_OK; } @@ -1561,12 +1540,12 @@ static int test__hybrid_hw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; } @@ -1582,7 +1561,7 @@ static int test__hybrid_sw_hw_group_event(struct evlist *evlist) evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; } @@ -1594,7 +1573,7 @@ static int test__hybrid_hw_sw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); @@ -1610,14 +1589,14 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -1631,17 +1610,17 @@ static int test__hybrid_raw1(struct evlist *evlist) if (!perf_pmu__hybrid_mounted("cpu_atom")) { TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); return TEST_OK; } TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); /* The type of second event is randome value */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); return TEST_OK; } @@ -1651,7 +1630,7 @@ static int test__hybrid_raw2(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); return TEST_OK; } -- cgit v1.2.3 From 4a7c4eafb74860fd7cb1eecbd2606cdd26809d0a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:20 -0700 Subject: perf test: Test more with config_cache test__checkevent_config_cache checks the parsing of "L1-dcache-misses/name=cachepmu/". Don't just check that the name is set correctly, also validate the rest of the perf_event_attr for L1-dcache-misses. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 4b00cb4aa73a..3f75f0315db8 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1406,7 +1406,7 @@ static int test__checkevent_config_cache(struct evlist *evlist) struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong name setting", evsel__name_is(evsel, "cachepmu")); - return TEST_OK; + return test__checkevent_genhw(evlist); } static bool test__pmu_cpu_valid(void) -- cgit v1.2.3 From a8af6e48c6220992430cd55713679f49c23ac6d2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:21 -0700 Subject: perf test: Roundtrip name, don't assume 1 event per name Opening hardware names and a legacy cache event on a hybrid PMU opens it on each PMU. Parsing and checking indexes fails, as the parsed index is double the expected. Avoid checking the index by just comparing the names immediately after the parse. This change removes hard coded hybrid logic and removes assumptions about the expansion of an event. On hybrid the PMUs may or may not support an event and so using a distance isn't a consistent solution. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/evsel-roundtrip-name.c | 119 +++++++++++++------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index e94fed901992..15ff86f9da0b 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -4,114 +4,93 @@ #include "parse-events.h" #include "tests.h" #include "debug.h" -#include "pmu.h" -#include "pmu-hybrid.h" -#include #include static int perf_evsel__roundtrip_cache_name_test(void) { - char name[128]; - int type, op, err = 0, ret = 0, i, idx; - struct evsel *evsel; - struct evlist *evlist = evlist__new(); + int ret = TEST_OK; - if (evlist == NULL) - return -ENOMEM; - - for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { - for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { + for (int type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { + for (int op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ if (!evsel__is_cache_op_valid(type, op)) continue; - for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); - err = parse_event(evlist, name); - if (err) - ret = err; - } - } - } - - idx = 0; - evsel = evlist__first(evlist); + for (int res = 0; res < PERF_COUNT_HW_CACHE_RESULT_MAX; res++) { + char name[128]; + struct evlist *evlist = evlist__new(); + struct evsel *evsel; + int err; - for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { - for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { - /* skip invalid cache type */ - if (!evsel__is_cache_op_valid(type, op)) - continue; + if (evlist == NULL) { + pr_debug("Failed to alloc evlist"); + return TEST_FAIL; + } + __evsel__hw_cache_type_op_res_name(type, op, res, + name, sizeof(name)); - for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { - __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); - if (evsel->core.idx != idx) + err = parse_event(evlist, name); + if (err) { + pr_debug("Failure to parse cache event '%s' possibly as PMUs don't support it", + name); + evlist__delete(evlist); continue; - - ++idx; - - if (strcmp(evsel__name(evsel), name)) { - pr_debug("%s != %s\n", evsel__name(evsel), name); - ret = -1; } - - evsel = evsel__next(evsel); + evlist__for_each_entry(evlist, evsel) { + if (strcmp(evsel__name(evsel), name)) { + pr_debug("%s != %s\n", evsel__name(evsel), name); + ret = TEST_FAIL; + } + } + evlist__delete(evlist); } } } - - evlist__delete(evlist); return ret; } -static int __perf_evsel__name_array_test(const char *const names[], int nr_names, - int distance) +static int perf_evsel__name_array_test(const char *const names[], int nr_names) { - int i, err; - struct evsel *evsel; - struct evlist *evlist = evlist__new(); + int ret = TEST_OK; - if (evlist == NULL) - return -ENOMEM; + for (int i = 0; i < nr_names; ++i) { + struct evlist *evlist = evlist__new(); + struct evsel *evsel; + int err; - for (i = 0; i < nr_names; ++i) { + if (evlist == NULL) { + pr_debug("Failed to alloc evlist"); + return TEST_FAIL; + } err = parse_event(evlist, names[i]); if (err) { pr_debug("failed to parse event '%s', err %d\n", names[i], err); - goto out_delete_evlist; + evlist__delete(evlist); + ret = TEST_FAIL; + continue; } - } - - err = 0; - evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), names[evsel->core.idx / distance])) { - --err; - pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->core.idx / distance]); + evlist__for_each_entry(evlist, evsel) { + if (strcmp(evsel__name(evsel), names[i])) { + pr_debug("%s != %s\n", evsel__name(evsel), names[i]); + ret = TEST_FAIL; + } } + evlist__delete(evlist); } - -out_delete_evlist: - evlist__delete(evlist); - return err; + return ret; } -#define perf_evsel__name_array_test(names, distance) \ - __perf_evsel__name_array_test(names, ARRAY_SIZE(names), distance) - static int test__perf_evsel__roundtrip_name_test(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - int err = 0, ret = 0; - - if (perf_pmu__has_hybrid() && perf_pmu__hybrid_mounted("cpu_atom")) - return perf_evsel__name_array_test(evsel__hw_names, 2); + int err = 0, ret = TEST_OK; - err = perf_evsel__name_array_test(evsel__hw_names, 1); + err = perf_evsel__name_array_test(evsel__hw_names, PERF_COUNT_HW_MAX); if (err) ret = err; - err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1, 1); + err = perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1); if (err) ret = err; -- cgit v1.2.3 From c9aeb2e9cc8ee02c6ad469a910a3aa9b083b76cf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:22 -0700 Subject: perf parse-events: Set attr.type to PMU type early Set attr.type to PMU type early so that later terms can override the value. Setting the value in perf_pmu__config means that earlier steps, like config_term_pmu, can override the value. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- tools/perf/util/pmu.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 34ba840ae19a..707c53f1be09 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1492,9 +1492,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, } else { memset(&attr, 0, sizeof(attr)); } + attr.type = pmu->type; if (!head_config) { - attr.type = pmu->type; evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true, /*name=*/NULL, /*metric_id=*/NULL, pmu, diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ad209c88a124..cb33d869f1ed 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1398,7 +1398,6 @@ int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, { bool zero = !!pmu->default_config; - attr->type = pmu->type; return perf_pmu__config_terms(pmu->name, &pmu->format, attr, head_terms, zero, err); } -- cgit v1.2.3 From cae256ae75cf2d62187c0477e2e08da71d537fc5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:23 -0700 Subject: perf parse-events: Set pmu_name whenever a pmu is given Change add_event to always set pmu_name when possible as not all code checks both pmu->name and evsel->pmu_name, for example, uniquify_counter in stat-display.c. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 707c53f1be09..9cb5f040a74c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -269,6 +269,7 @@ __add_event(struct list_head *list, int *idx, evsel->core.requires_cpu = pmu ? pmu->is_uncore : false; evsel->auto_merge_stats = auto_merge_stats; evsel->pmu = pmu; + evsel->pmu_name = pmu && pmu->name ? strdup(pmu->name) : NULL; if (name) evsel->name = strdup(name); @@ -1500,12 +1501,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, /*metric_id=*/NULL, pmu, /*config_terms=*/NULL, auto_merge_stats, /*cpu_list=*/NULL); - if (evsel) { - evsel->pmu_name = name ? strdup(name) : NULL; - return 0; - } else { - return -ENOMEM; - } + return evsel ? 0 : -ENOMEM; } if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, head_config, &info)) @@ -1561,7 +1557,6 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (evsel->name) evsel->use_config_name = true; - evsel->pmu_name = name ? strdup(name) : NULL; evsel->percore = config_term_percore(&evsel->config_terms); if (parse_state->fake_pmu) -- cgit v1.2.3 From 442eeb77044705f2952a88a1c5d4a902f444bf02 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:24 -0700 Subject: perf print-events: Avoid unnecessary strlist The strlist in print_hwcache_events holds the event names as they are generated, and then it is iterated and printed. This is unnecessary and each event can just be printed as it is processed. Rename the variable i to res, to be more intention revealing and consistent with other code. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 60 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index ee145cec42c0..89ac34a922c9 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -230,58 +230,60 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) int print_hwcache_events(const struct print_callbacks *print_cb, void *print_state) { - struct strlist *evt_name_list = strlist__new(NULL, NULL); - struct str_node *nd; + const char *event_type_descriptor = event_type_descriptors[PERF_TYPE_HW_CACHE]; - if (!evt_name_list) { - pr_debug("Failed to allocate new strlist for hwcache events\n"); - return -ENOMEM; - } for (int type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { for (int op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { /* skip invalid cache type */ if (!evsel__is_cache_op_valid(type, op)) continue; - for (int i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { + for (int res = 0; res < PERF_COUNT_HW_CACHE_RESULT_MAX; res++) { struct perf_pmu *pmu = NULL; char name[64]; - __evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); + __evsel__hw_cache_type_op_res_name(type, op, res, + name, sizeof(name)); if (!perf_pmu__has_hybrid()) { if (is_event_supported(PERF_TYPE_HW_CACHE, - type | (op << 8) | (i << 16))) - strlist__add(evt_name_list, name); + type | (op << 8) | (res << 16))) { + print_cb->print_event(print_state, + "cache", + /*pmu_name=*/NULL, + name, + /*event_alias=*/NULL, + /*scale_unit=*/NULL, + /*deprecated=*/false, + event_type_descriptor, + /*desc=*/NULL, + /*long_desc=*/NULL, + /*encoding_desc=*/NULL); + } continue; } perf_pmu__for_each_hybrid_pmu(pmu) { if (is_event_supported(PERF_TYPE_HW_CACHE, - type | (op << 8) | (i << 16) | + type | (op << 8) | (res << 16) | ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT))) { char new_name[128]; - snprintf(new_name, sizeof(new_name), - "%s/%s/", pmu->name, name); - strlist__add(evt_name_list, new_name); + snprintf(new_name, sizeof(new_name), + "%s/%s/", pmu->name, name); + print_cb->print_event(print_state, + "cache", + pmu->name, + name, + new_name, + /*scale_unit=*/NULL, + /*deprecated=*/false, + event_type_descriptor, + /*desc=*/NULL, + /*long_desc=*/NULL, + /*encoding_desc=*/NULL); } } } } } - - strlist__for_each_entry(nd, evt_name_list) { - print_cb->print_event(print_state, - "cache", - /*pmu_name=*/NULL, - nd->s, - /*event_alias=*/NULL, - /*scale_unit=*/NULL, - /*deprecated=*/false, - event_type_descriptors[PERF_TYPE_HW_CACHE], - /*desc=*/NULL, - /*long_desc=*/NULL, - /*encoding_desc=*/NULL); - } - strlist__delete(evt_name_list); return 0; } -- cgit v1.2.3 From 70c90e4a6b2fbe775b662eafefae51f64d627790 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:25 -0700 Subject: perf parse-events: Avoid scanning PMUs before parsing The event parser needs to handle two special cases: 1) legacy events like L1-dcache-load-miss. These event names don't appear in JSON or sysfs, and lookup tables are used for the config value. 2) raw events where 'r0xead' is the same as 'read' unless the PMU has an event called 'read' in which case the event has priority. The previous parser to handle these cases would scan all PMUs for components of event names. These components would then be used to classify in the lexer whether the token should be part of a legacy event, a raw event or an event. The grammar would handle legacy event tokens or recombining the tokens back into a regular event name. The code wasn't PMU specific and had issues around events like AMD's branch-brs that would fail to parse as it expects brs to be a suffix on a legacy event style name: $ perf stat -e branch-brs true event syntax error: 'branch-brs' \___ parser error This change removes processing all PMUs by using the lexer in the form of a regular expression matcher. The lexer will return the token for the longest matched sequence of characters, and in the event of a tie the first. The legacy events are a fixed number of regular expressions, and by matching these before a name token its possible to generate an accurate legacy event token with everything else matching as a name. Because of the lexer change the handling of hyphens in the grammar can be removed as hyphens just become a part of the name. To handle raw events and terms the parser is changed to defer trying to evaluate whether something is a raw event until the PMU is known in the grammar. Once the PMU is known, the events of the PMU can be scanned for the 'read' style problem. A new term type is added for these raw terms, used to enable deferring the evaluation. While this change is large, it has stats of: 170 insertions(+), 436 deletions(-) the bulk of the change is deleting the old approach. It isn't possible to break apart the code added due to the dependencies on how the parts of the parsing work. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 24 +-- tools/perf/tests/pmu-events.c | 9 -- tools/perf/util/parse-events.c | 329 +++++++++++++--------------------------- tools/perf/util/parse-events.h | 16 +- tools/perf/util/parse-events.l | 85 ++--------- tools/perf/util/parse-events.y | 143 ++++++----------- 6 files changed, 170 insertions(+), 436 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 3f75f0315db8..5ba90b32c524 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -676,11 +676,11 @@ static int test__checkterms_simple(struct list_head *terms) */ term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", - term->type_term == PARSE_EVENTS__TERM_TYPE_USER); + term->type_term == PARSE_EVENTS__TERM_TYPE_RAW); TEST_ASSERT_VAL("wrong type val", - term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - TEST_ASSERT_VAL("wrong val", term->val.num == 1); - TEST_ASSERT_VAL("wrong config", !strcmp(term->config, "read")); + term->type_val == PARSE_EVENTS__TERM_TYPE_STR); + TEST_ASSERT_VAL("wrong val", !strcmp(term->val.str, "read")); + TEST_ASSERT_VAL("wrong config", !strcmp(term->config, "raw")); /* * r0xead @@ -690,11 +690,11 @@ static int test__checkterms_simple(struct list_head *terms) */ term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", - term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG); + term->type_term == PARSE_EVENTS__TERM_TYPE_RAW); TEST_ASSERT_VAL("wrong type val", - term->type_val == PARSE_EVENTS__TERM_TYPE_NUM); - TEST_ASSERT_VAL("wrong val", term->val.num == 0xead); - TEST_ASSERT_VAL("wrong config", !strcmp(term->config, "config")); + term->type_val == PARSE_EVENTS__TERM_TYPE_STR); + TEST_ASSERT_VAL("wrong val", !strcmp(term->val.str, "r0xead")); + TEST_ASSERT_VAL("wrong config", !strcmp(term->config, "raw")); return TEST_OK; } @@ -2104,7 +2104,6 @@ static int test_event_fake_pmu(const char *str) return -ENOMEM; parse_events_error__init(&err); - perf_pmu__test_parse_init(); ret = __parse_events(evlist, str, &err, &perf_pmu__fake, /*warn_if_reordered=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d, str '%s'\n", @@ -2158,13 +2157,6 @@ static int test_term(const struct terms_test *t) INIT_LIST_HEAD(&terms); - /* - * The perf_pmu__test_parse_init prepares perf_pmu_events_list - * which gets freed in parse_events_terms. - */ - if (perf_pmu__test_parse_init()) - return -1; - ret = parse_events_terms(&terms, t->str); if (ret) { pr_debug("failed to parse terms '%s', err %d\n", diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 1dff863b9711..a2cde61b1c77 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -776,15 +776,6 @@ static int check_parse_id(const char *id, struct parse_events_error *error, for (cur = strchr(dup, '@') ; cur; cur = strchr(++cur, '@')) *cur = '/'; - if (fake_pmu) { - /* - * Every call to __parse_events will try to initialize the PMU - * state from sysfs and then clean it up at the end. Reset the - * PMU events to the test state so that we don't pick up - * erroneous prefixes and suffixes. - */ - perf_pmu__test_parse_init(); - } ret = __parse_events(evlist, dup, error, fake_pmu, /*warn_if_reordered=*/true); free(dup); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9cb5f040a74c..b5d95fce520c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -34,11 +34,6 @@ #define MAX_NAME_LEN 100 -struct perf_pmu_event_symbol { - char *symbol; - enum perf_pmu_event_symbol_type type; -}; - #ifdef PARSER_DEBUG extern int parse_events_debug; #endif @@ -49,15 +44,6 @@ static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, const char *str, char *pmu_name, struct list_head *list); -static struct perf_pmu_event_symbol *perf_pmu_events_list; -/* - * The variable indicates the number of supported pmu event symbols. - * 0 means not initialized and ready to init - * -1 means failed to init, don't try anymore - * >0 is the number of supported pmu event symbols - */ -static int perf_pmu_events_list_num; - struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = { .symbol = "cpu-cycles", @@ -236,6 +222,57 @@ static char *get_config_name(struct list_head *head_terms) return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME); } +/** + * fix_raw - For each raw term see if there is an event (aka alias) in pmu that + * matches the raw's string value. If the string value matches an + * event then change the term to be an event, if not then change it to + * be a config term. For example, "read" may be an event of the PMU or + * a raw hex encoding of 0xead. The fix-up is done late so the PMU of + * the event can be determined and we don't need to scan all PMUs + * ahead-of-time. + * @config_terms: the list of terms that may contain a raw term. + * @pmu: the PMU to scan for events from. + */ +static void fix_raw(struct list_head *config_terms, struct perf_pmu *pmu) +{ + struct parse_events_term *term; + + list_for_each_entry(term, config_terms, list) { + struct perf_pmu_alias *alias; + bool matched = false; + + if (term->type_term != PARSE_EVENTS__TERM_TYPE_RAW) + continue; + + list_for_each_entry(alias, &pmu->aliases, list) { + if (!strcmp(alias->name, term->val.str)) { + free(term->config); + term->config = term->val.str; + term->type_val = PARSE_EVENTS__TERM_TYPE_NUM; + term->type_term = PARSE_EVENTS__TERM_TYPE_USER; + term->val.num = 1; + term->no_value = true; + matched = true; + break; + } + } + if (!matched) { + u64 num; + + free(term->config); + term->config = strdup("config"); + errno = 0; + num = strtoull(term->val.str + 1, NULL, 16); + assert(errno == 0); + free(term->val.str); + term->type_val = PARSE_EVENTS__TERM_TYPE_NUM; + term->type_term = PARSE_EVENTS__TERM_TYPE_CONFIG; + term->val.num = num; + term->no_value = false; + } + } +} + static struct evsel * __add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, @@ -329,18 +366,27 @@ static int add_event_tool(struct list_head *list, int *idx, return 0; } -static int parse_aliases(char *str, const char *const names[][EVSEL__MAX_ALIASES], int size) +/** + * parse_aliases - search names for entries beginning or equalling str ignoring + * case. If mutliple entries in names match str then the longest + * is chosen. + * @str: The needle to look for. + * @names: The haystack to search. + * @size: The size of the haystack. + * @longest: Out argument giving the length of the matching entry. + */ +static int parse_aliases(const char *str, const char *const names[][EVSEL__MAX_ALIASES], int size, + int *longest) { - int i, j; - int n, longest = -1; + *longest = -1; + for (int i = 0; i < size; i++) { + for (int j = 0; j < EVSEL__MAX_ALIASES && names[i][j]; j++) { + int n = strlen(names[i][j]); - for (i = 0; i < size; i++) { - for (j = 0; j < EVSEL__MAX_ALIASES && names[i][j]; j++) { - n = strlen(names[i][j]); - if (n > longest && !strncasecmp(str, names[i][j], n)) - longest = n; + if (n > *longest && !strncasecmp(str, names[i][j], n)) + *longest = n; } - if (longest > 0) + if (*longest > 0) return i; } @@ -358,52 +404,58 @@ static int config_attr(struct perf_event_attr *attr, struct parse_events_error *err, config_term_func_t config_term); -int parse_events_add_cache(struct list_head *list, int *idx, - char *type, char *op_result1, char *op_result2, +int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_error *err, struct list_head *head_config, struct parse_events_state *parse_state) { struct perf_event_attr attr; LIST_HEAD(config_terms); - char name[MAX_NAME_LEN]; const char *config_name, *metric_id; int cache_type = -1, cache_op = -1, cache_result = -1; - char *op_result[2] = { op_result1, op_result2 }; - int i, n, ret; + int ret, len; + const char *name_end = &name[strlen(name) + 1]; bool hybrid; + const char *str = name; /* - * No fallback - if we cannot get a clear cache type - * then bail out: + * Search str for the legacy cache event name composed of 1, 2 or 3 + * hyphen separated sections. The first section is the cache type while + * the others are the optional op and optional result. To make life hard + * the names in the table also contain hyphens and the longest name + * should always be selected. */ - cache_type = parse_aliases(type, evsel__hw_cache, PERF_COUNT_HW_CACHE_MAX); + cache_type = parse_aliases(str, evsel__hw_cache, PERF_COUNT_HW_CACHE_MAX, &len); if (cache_type == -1) return -EINVAL; + str += len + 1; config_name = get_config_name(head_config); - n = snprintf(name, MAX_NAME_LEN, "%s", type); - - for (i = 0; (i < 2) && (op_result[i]); i++) { - char *str = op_result[i]; - - n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", str); - - if (cache_op == -1) { + if (str < name_end) { + cache_op = parse_aliases(str, evsel__hw_cache_op, + PERF_COUNT_HW_CACHE_OP_MAX, &len); + if (cache_op >= 0) { + if (!evsel__is_cache_op_valid(cache_type, cache_op)) + return -EINVAL; + str += len + 1; + } else { + cache_result = parse_aliases(str, evsel__hw_cache_result, + PERF_COUNT_HW_CACHE_RESULT_MAX, &len); + if (cache_result >= 0) + str += len + 1; + } + } + if (str < name_end) { + if (cache_op < 0) { cache_op = parse_aliases(str, evsel__hw_cache_op, - PERF_COUNT_HW_CACHE_OP_MAX); + PERF_COUNT_HW_CACHE_OP_MAX, &len); if (cache_op >= 0) { if (!evsel__is_cache_op_valid(cache_type, cache_op)) return -EINVAL; - continue; } - } - - if (cache_result == -1) { + } else if (cache_result < 0) { cache_result = parse_aliases(str, evsel__hw_cache_result, - PERF_COUNT_HW_CACHE_RESULT_MAX); - if (cache_result >= 0) - continue; + PERF_COUNT_HW_CACHE_RESULT_MAX, &len); } } @@ -969,6 +1021,7 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT] = "aux-output", [PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE] = "aux-sample-size", [PARSE_EVENTS__TERM_TYPE_METRIC_ID] = "metric-id", + [PARSE_EVENTS__TERM_TYPE_RAW] = "raw", }; static bool config_term_shrinked; @@ -1090,6 +1143,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_METRIC_ID: CHECK_TYPE_VAL(STR); break; + case PARSE_EVENTS__TERM_TYPE_RAW: + CHECK_TYPE_VAL(STR); + break; case PARSE_EVENTS__TERM_TYPE_MAX_STACK: CHECK_TYPE_VAL(NUM); break; @@ -1486,6 +1542,8 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, parse_events_error__handle(err, 0, err_str, NULL); return -EINVAL; } + if (head_config) + fix_raw(head_config, pmu); if (pmu->default_config) { memcpy(&attr, pmu->default_config, @@ -1870,180 +1928,6 @@ int parse_events_name(struct list_head *list, const char *name) return 0; } -static int -comp_pmu(const void *p1, const void *p2) -{ - struct perf_pmu_event_symbol *pmu1 = (struct perf_pmu_event_symbol *) p1; - struct perf_pmu_event_symbol *pmu2 = (struct perf_pmu_event_symbol *) p2; - - return strcasecmp(pmu1->symbol, pmu2->symbol); -} - -static void perf_pmu__parse_cleanup(void) -{ - if (perf_pmu_events_list_num > 0) { - struct perf_pmu_event_symbol *p; - int i; - - for (i = 0; i < perf_pmu_events_list_num; i++) { - p = perf_pmu_events_list + i; - zfree(&p->symbol); - } - zfree(&perf_pmu_events_list); - perf_pmu_events_list_num = 0; - } -} - -#define SET_SYMBOL(str, stype) \ -do { \ - p->symbol = str; \ - if (!p->symbol) \ - goto err; \ - p->type = stype; \ -} while (0) - -/* - * Read the pmu events list from sysfs - * Save it into perf_pmu_events_list - */ -static void perf_pmu__parse_init(void) -{ - - struct perf_pmu *pmu = NULL; - struct perf_pmu_alias *alias; - int len = 0; - - pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - list_for_each_entry(alias, &pmu->aliases, list) { - char *tmp = strchr(alias->name, '-'); - - if (tmp) { - char *tmp2 = NULL; - - tmp2 = strchr(tmp + 1, '-'); - len++; - if (tmp2) - len++; - } - - len++; - } - } - - if (len == 0) { - perf_pmu_events_list_num = -1; - return; - } - perf_pmu_events_list = malloc(sizeof(struct perf_pmu_event_symbol) * len); - if (!perf_pmu_events_list) - return; - perf_pmu_events_list_num = len; - - len = 0; - pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - list_for_each_entry(alias, &pmu->aliases, list) { - struct perf_pmu_event_symbol *p = perf_pmu_events_list + len; - char *tmp = strchr(alias->name, '-'); - char *tmp2 = NULL; - - if (tmp) - tmp2 = strchr(tmp + 1, '-'); - if (tmp2) { - SET_SYMBOL(strndup(alias->name, tmp - alias->name), - PMU_EVENT_SYMBOL_PREFIX); - p++; - tmp++; - SET_SYMBOL(strndup(tmp, tmp2 - tmp), PMU_EVENT_SYMBOL_SUFFIX); - p++; - SET_SYMBOL(strdup(++tmp2), PMU_EVENT_SYMBOL_SUFFIX2); - len += 3; - } else if (tmp) { - SET_SYMBOL(strndup(alias->name, tmp - alias->name), - PMU_EVENT_SYMBOL_PREFIX); - p++; - SET_SYMBOL(strdup(++tmp), PMU_EVENT_SYMBOL_SUFFIX); - len += 2; - } else { - SET_SYMBOL(strdup(alias->name), PMU_EVENT_SYMBOL); - len++; - } - } - } - qsort(perf_pmu_events_list, len, - sizeof(struct perf_pmu_event_symbol), comp_pmu); - - return; -err: - perf_pmu__parse_cleanup(); -} - -/* - * This function injects special term in - * perf_pmu_events_list so the test code - * can check on this functionality. - */ -int perf_pmu__test_parse_init(void) -{ - struct perf_pmu_event_symbol *list, *tmp, symbols[] = { - {(char *)"read", PMU_EVENT_SYMBOL}, - {(char *)"event", PMU_EVENT_SYMBOL_PREFIX}, - {(char *)"two", PMU_EVENT_SYMBOL_SUFFIX}, - {(char *)"hyphen", PMU_EVENT_SYMBOL_SUFFIX}, - {(char *)"hyph", PMU_EVENT_SYMBOL_SUFFIX2}, - }; - unsigned long i, j; - - tmp = list = malloc(sizeof(*list) * ARRAY_SIZE(symbols)); - if (!list) - return -ENOMEM; - - for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) { - tmp->type = symbols[i].type; - tmp->symbol = strdup(symbols[i].symbol); - if (!tmp->symbol) - goto err_free; - } - - perf_pmu_events_list = list; - perf_pmu_events_list_num = ARRAY_SIZE(symbols); - - qsort(perf_pmu_events_list, ARRAY_SIZE(symbols), - sizeof(struct perf_pmu_event_symbol), comp_pmu); - return 0; - -err_free: - for (j = 0, tmp = list; j < i; j++, tmp++) - zfree(&tmp->symbol); - free(list); - return -ENOMEM; -} - -enum perf_pmu_event_symbol_type -perf_pmu__parse_check(const char *name) -{ - struct perf_pmu_event_symbol p, *r; - - /* scan kernel pmu events from sysfs if needed */ - if (perf_pmu_events_list_num == 0) - perf_pmu__parse_init(); - /* - * name "cpu" could be prefix of cpu-cycles or cpu// events. - * cpu-cycles has been handled by hardcode. - * So it must be cpu// events, not kernel pmu event. - */ - if ((perf_pmu_events_list_num <= 0) || !strcmp(name, "cpu")) - return PMU_EVENT_SYMBOL_ERR; - - p.symbol = strdup(name); - r = bsearch(&p, perf_pmu_events_list, - (size_t) perf_pmu_events_list_num, - sizeof(struct perf_pmu_event_symbol), comp_pmu); - zfree(&p.symbol); - return r ? r->type : PMU_EVENT_SYMBOL_ERR; -} - static int parse_events__scanner(const char *str, struct parse_events_state *parse_state) { @@ -2081,7 +1965,6 @@ int parse_events_terms(struct list_head *terms, const char *str) int ret; ret = parse_events__scanner(str, &parse_state); - perf_pmu__parse_cleanup(); if (!ret) { list_splice(parse_state.terms, terms); @@ -2106,7 +1989,6 @@ static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, int ret; ret = parse_events__scanner(str, &ps); - perf_pmu__parse_cleanup(); if (!ret) { if (!list_empty(&ps.list)) { @@ -2269,7 +2151,6 @@ int __parse_events(struct evlist *evlist, const char *str, int ret; ret = parse_events__scanner(str, &parse_state); - perf_pmu__parse_cleanup(); if (!ret && list_empty(&parse_state.list)) { WARN_ONCE(true, "WARNING: event parser found nothing\n"); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 86ad4438a2aa..f638542c8638 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -41,14 +41,6 @@ int parse_events_terms(struct list_head *terms, const char *str); int parse_filter(const struct option *opt, const char *str, int unset); int exclude_perf(const struct option *opt, const char *arg, int unset); -enum perf_pmu_event_symbol_type { - PMU_EVENT_SYMBOL_ERR, /* not a PMU EVENT */ - PMU_EVENT_SYMBOL, /* normal style PMU event */ - PMU_EVENT_SYMBOL_PREFIX, /* prefix of pre-suf style event */ - PMU_EVENT_SYMBOL_SUFFIX, /* suffix of pre-suf style event */ - PMU_EVENT_SYMBOL_SUFFIX2, /* suffix of pre-suf2 style event */ -}; - enum { PARSE_EVENTS__TERM_TYPE_NUM, PARSE_EVENTS__TERM_TYPE_STR, @@ -78,6 +70,7 @@ enum { PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE, PARSE_EVENTS__TERM_TYPE_METRIC_ID, + PARSE_EVENTS__TERM_TYPE_RAW, __PARSE_EVENTS__TERM_TYPE_NR, }; @@ -174,8 +167,7 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, int tool_event); -int parse_events_add_cache(struct list_head *list, int *idx, - char *type, char *op_result1, char *op_result2, +int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_error *error, struct list_head *head_config, struct parse_events_state *parse_state); @@ -198,8 +190,6 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, int parse_events_copy_term_list(struct list_head *old, struct list_head **new); -enum perf_pmu_event_symbol_type -perf_pmu__parse_check(const char *name); void parse_events__set_leader(char *name, struct list_head *list); void parse_events_update_lists(struct list_head *list_event, struct list_head *list_all); @@ -241,8 +231,6 @@ static inline bool is_sdt_event(char *str __maybe_unused) } #endif /* HAVE_LIBELF_SUPPORT */ -int perf_pmu__test_parse_init(void); - struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx, struct perf_event_attr *attr, const char *name, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 51fe0a9fb3de..4b35c099189a 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -63,17 +63,6 @@ static int str(yyscan_t scanner, int token) return token; } -static int raw(yyscan_t scanner) -{ - YYSTYPE *yylval = parse_events_get_lval(scanner); - char *text = parse_events_get_text(scanner); - - if (perf_pmu__parse_check(text) == PMU_EVENT_SYMBOL) - return str(scanner, PE_NAME); - - return __value(yylval, text + 1, 16, PE_RAW); -} - static bool isbpf_suffix(char *text) { int len = strlen(text); @@ -131,35 +120,6 @@ do { \ yyless(0); \ } while (0) -static int pmu_str_check(yyscan_t scanner, struct parse_events_state *parse_state) -{ - YYSTYPE *yylval = parse_events_get_lval(scanner); - char *text = parse_events_get_text(scanner); - - yylval->str = strdup(text); - - /* - * If we're not testing then parse check determines the PMU event type - * which if it isn't a PMU returns PE_NAME. When testing the result of - * parse check can't be trusted so we return PE_PMU_EVENT_FAKE unless - * an '!' is present in which case the text can't be a PMU name. - */ - switch (perf_pmu__parse_check(text)) { - case PMU_EVENT_SYMBOL_PREFIX: - return PE_PMU_EVENT_PRE; - case PMU_EVENT_SYMBOL_SUFFIX: - return PE_PMU_EVENT_SUF; - case PMU_EVENT_SYMBOL_SUFFIX2: - return PE_PMU_EVENT_SUF2; - case PMU_EVENT_SYMBOL: - return parse_state->fake_pmu - ? PE_PMU_EVENT_FAKE : PE_KERNEL_PMU_EVENT; - default: - return parse_state->fake_pmu && !strchr(text,'!') - ? PE_PMU_EVENT_FAKE : PE_NAME; - } -} - static int sym(yyscan_t scanner, int type, int config) { YYSTYPE *yylval = parse_events_get_lval(scanner); @@ -211,13 +171,15 @@ bpf_source [^,{}]+\.c[a-zA-Z0-9._]* num_dec [0-9]+ num_hex 0x[a-fA-F0-9]+ num_raw_hex [a-fA-F0-9]+ -name [a-zA-Z_*?\[\]][a-zA-Z0-9_*?.\[\]!]* +name [a-zA-Z_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* If you add a modifier you need to update check_modifier() */ modifier_event [ukhpPGHSDIWeb]+ modifier_bp [rwx]{1,3} +lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) +lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) %% @@ -303,8 +265,8 @@ percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } metric-id { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } -r{num_raw_hex} { return raw(yyscanner); } -r0x{num_raw_hex} { return raw(yyscanner); } +r{num_raw_hex} { return str(yyscanner, PE_RAW); } +r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } @@ -359,47 +321,20 @@ system_time { return tool(yyscanner, PERF_TOOL_SYSTEM_TIME); } bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); } - /* - * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately. - * Because the prefix cycles is mixed up with cpu-cycles. - * loads and stores are mixed up with cache event - */ -cycles-ct | -cycles-t | -mem-loads | -mem-loads-aux | -mem-stores | -topdown-[a-z-]+ | -tx-capacity-[a-z-]+ | -el-capacity-[a-z-]+ { return str(yyscanner, PE_KERNEL_PMU_EVENT); } - -L1-dcache|l1-d|l1d|L1-data | -L1-icache|l1-i|l1i|L1-instruction | -LLC|L2 | -dTLB|d-tlb|Data-TLB | -iTLB|i-tlb|Instruction-TLB | -branch|branches|bpu|btb|bpc | -node { return str(yyscanner, PE_NAME_CACHE_TYPE); } - -load|loads|read | -store|stores|write | -prefetch|prefetches | -speculative-read|speculative-load | -refs|Reference|ops|access | -misses|miss { return str(yyscanner, PE_NAME_CACHE_OP_RESULT); } - +{lc_type} { return str(yyscanner, PE_LEGACY_CACHE); } +{lc_type}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } +{lc_type}-{lc_op_result}-{lc_op_result} { return str(yyscanner, PE_LEGACY_CACHE); } mem: { BEGIN(mem); return PE_PREFIX_MEM; } -r{num_raw_hex} { return raw(yyscanner); } +r{num_raw_hex} { return str(yyscanner, PE_RAW); } {num_dec} { return value(yyscanner, 10); } {num_hex} { return value(yyscanner, 16); } {modifier_event} { return str(yyscanner, PE_MODIFIER_EVENT); } {bpf_object} { if (!isbpf(yyscanner)) { USER_REJECT }; return str(yyscanner, PE_BPF_OBJECT); } {bpf_source} { if (!isbpf(yyscanner)) { USER_REJECT }; return str(yyscanner, PE_BPF_SOURCE); } -{name} { return pmu_str_check(yyscanner, _parse_state); } +{name} { return str(yyscanner, PE_NAME); } {name_tag} { return str(yyscanner, PE_NAME); } "/" { BEGIN(config); return '/'; } -- { return '-'; } , { BEGIN(event); return ','; } : { return ':'; } "{" { BEGIN(event); return '{'; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 4488443e506e..e7072b5601c5 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -8,6 +8,7 @@ #define YYDEBUG 1 +#include #include #include #include @@ -52,36 +53,35 @@ static void free_list_evsel(struct list_head* list_evsel) %} %token PE_START_EVENTS PE_START_TERMS -%token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_RAW PE_TERM +%token PE_VALUE PE_VALUE_SYM_HW PE_VALUE_SYM_SW PE_TERM %token PE_VALUE_SYM_TOOL %token PE_EVENT_NAME -%token PE_NAME +%token PE_RAW PE_NAME %token PE_BPF_OBJECT PE_BPF_SOURCE %token PE_MODIFIER_EVENT PE_MODIFIER_BP -%token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT +%token PE_LEGACY_CACHE %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP %token PE_ERROR -%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%token PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %token PE_ARRAY_ALL PE_ARRAY_RANGE %token PE_DRV_CFG_TERM %type PE_VALUE %type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW %type PE_VALUE_SYM_TOOL -%type PE_RAW %type PE_TERM %type value_sym +%type PE_RAW %type PE_NAME %type PE_BPF_OBJECT %type PE_BPF_SOURCE -%type PE_NAME_CACHE_TYPE -%type PE_NAME_CACHE_OP_RESULT +%type PE_LEGACY_CACHE %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP %type PE_EVENT_NAME -%type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%type PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM -%type event_pmu_name +%type name_or_raw %destructor { free ($$); } %type event_term %destructor { parse_events_term__delete ($$); } @@ -273,11 +273,8 @@ event_def: event_pmu | event_legacy_raw sep_dc | event_bpf_file -event_pmu_name: -PE_NAME | PE_PMU_EVENT_PRE - event_pmu: -event_pmu_name opt_pmu_config +PE_NAME opt_pmu_config { struct parse_events_state *parse_state = _parse_state; struct parse_events_error *error = parse_state->error; @@ -303,10 +300,12 @@ event_pmu_name opt_pmu_config list = alloc_list(); if (!list) CLEANUP_YYABORT; + /* Attempt to add to list assuming $1 is a PMU name. */ if (parse_events_add_pmu(_parse_state, list, $1, $2, /*auto_merge_stats=*/false)) { struct perf_pmu *pmu = NULL; int ok = 0; + /* Failure to add, try wildcard expansion of $1 as a PMU name. */ if (asprintf(&pattern, "%s*", $1) < 0) CLEANUP_YYABORT; @@ -329,6 +328,12 @@ event_pmu_name opt_pmu_config } } + if (!ok) { + /* Failure to add, assume $1 is an event name. */ + zfree(&list); + ok = !parse_events_multi_pmu_add(_parse_state, $1, $2, &list); + $2 = NULL; + } if (!ok) CLEANUP_YYABORT; } @@ -352,41 +357,27 @@ PE_KERNEL_PMU_EVENT sep_dc $$ = list; } | -PE_KERNEL_PMU_EVENT opt_pmu_config +PE_NAME sep_dc { struct list_head *list; int err; - /* frees $2 */ - err = parse_events_multi_pmu_add(_parse_state, $1, $2, &list); + err = parse_events_multi_pmu_add(_parse_state, $1, NULL, &list); free($1); if (err < 0) YYABORT; $$ = list; } | -PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF '-' PE_PMU_EVENT_SUF2 sep_dc -{ - struct list_head *list; - char pmu_name[128]; - snprintf(pmu_name, sizeof(pmu_name), "%s-%s-%s", $1, $3, $5); - free($1); - free($3); - free($5); - if (parse_events_multi_pmu_add(_parse_state, pmu_name, NULL, &list) < 0) - YYABORT; - $$ = list; -} -| -PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc +PE_KERNEL_PMU_EVENT opt_pmu_config { struct list_head *list; - char pmu_name[128]; + int err; - snprintf(pmu_name, sizeof(pmu_name), "%s-%s", $1, $3); + /* frees $2 */ + err = parse_events_multi_pmu_add(_parse_state, $1, $2, &list); free($1); - free($3); - if (parse_events_multi_pmu_add(_parse_state, pmu_name, NULL, &list) < 0) + if (err < 0) YYABORT; $$ = list; } @@ -476,7 +467,7 @@ PE_VALUE_SYM_TOOL sep_slash_slash_dc } event_legacy_cache: -PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config +PE_LEGACY_CACHE opt_event_config { struct parse_events_state *parse_state = _parse_state; struct parse_events_error *error = parse_state->error; @@ -485,51 +476,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_e list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6, - parse_state); - parse_events_terms__delete($6); - free($1); - free($3); - free($5); - if (err) { - free_list_evsel(list); - YYABORT; - } - $$ = list; -} -| -PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config -{ - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - struct list_head *list; - int err; + err = parse_events_add_cache(list, &parse_state->idx, $1, error, $2, parse_state); - list = alloc_list(); - ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4, - parse_state); - parse_events_terms__delete($4); - free($1); - free($3); - if (err) { - free_list_evsel(list); - YYABORT; - } - $$ = list; -} -| -PE_NAME_CACHE_TYPE opt_event_config -{ - struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; - struct list_head *list; - int err; - - list = alloc_list(); - ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2, - parse_state); parse_events_terms__delete($2); free($1); if (err) { @@ -633,17 +581,6 @@ tracepoint_name opt_event_config } tracepoint_name: -PE_NAME '-' PE_NAME ':' PE_NAME -{ - struct tracepoint_name tracepoint; - - ABORT_ON(asprintf(&tracepoint.sys, "%s-%s", $1, $3) < 0); - tracepoint.event = $5; - free($1); - free($3); - $$ = tracepoint; -} -| PE_NAME ':' PE_NAME { struct tracepoint_name tracepoint = {$1, $3}; @@ -673,10 +610,15 @@ PE_RAW opt_event_config { struct list_head *list; int err; + u64 num; list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, $1, $2); + errno = 0; + num = strtoull($1 + 1, NULL, 16); + ABORT_ON(errno); + free($1); + err = parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, num, $2); parse_events_terms__delete($2); if (err) { free(list); @@ -781,17 +723,22 @@ event_term $$ = head; } +name_or_raw: PE_RAW | PE_NAME + event_term: PE_RAW { struct parse_events_term *term; - ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_CONFIG, - NULL, $1, false, &@1, NULL)); + if (parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_RAW, + strdup("raw"), $1, &@1, &@1)) { + free($1); + YYABORT; + } $$ = term; } | -PE_NAME '=' PE_NAME +name_or_raw '=' PE_NAME { struct parse_events_term *term; @@ -804,7 +751,7 @@ PE_NAME '=' PE_NAME $$ = term; } | -PE_NAME '=' PE_VALUE +name_or_raw '=' PE_VALUE { struct parse_events_term *term; @@ -816,7 +763,7 @@ PE_NAME '=' PE_VALUE $$ = term; } | -PE_NAME '=' PE_VALUE_SYM_HW +name_or_raw '=' PE_VALUE_SYM_HW { struct parse_events_term *term; int config = $3 & 255; @@ -876,7 +823,7 @@ PE_TERM $$ = term; } | -PE_NAME array '=' PE_NAME +name_or_raw array '=' PE_NAME { struct parse_events_term *term; @@ -891,7 +838,7 @@ PE_NAME array '=' PE_NAME $$ = term; } | -PE_NAME array '=' PE_VALUE +name_or_raw array '=' PE_VALUE { struct parse_events_term *term; -- cgit v1.2.3 From ae4aa00a1a9358e0007f6edc71b018a0b0d21190 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:27 -0700 Subject: perf test: Move x86 hybrid tests to arch/x86 The tests use x86 hybrid specific PMUs. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/include/arch-tests.h | 1 + tools/perf/arch/x86/tests/Build | 1 + tools/perf/arch/x86/tests/arch-tests.c | 10 ++ tools/perf/arch/x86/tests/hybrid.c | 277 +++++++++++++++++++++++++++++++ tools/perf/tests/parse-events.c | 181 -------------------- 5 files changed, 289 insertions(+), 181 deletions(-) create mode 100644 tools/perf/arch/x86/tests/hybrid.c diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 902e9ea9b99e..33d39c1d3e64 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -11,6 +11,7 @@ int test__intel_pt_pkt_decoder(struct test_suite *test, int subtest); int test__intel_pt_hybrid_compat(struct test_suite *test, int subtest); int test__bp_modify(struct test_suite *test, int subtest); int test__x86_sample_parsing(struct test_suite *test, int subtest); +int test__hybrid(struct test_suite *test, int subtest); extern struct test_suite *arch_tests[]; diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index 6f4e8636c3bf..08cc8b9c931e 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -3,5 +3,6 @@ perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o perf-y += arch-tests.o perf-y += sample-parsing.o +perf-y += hybrid.o perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-test.o perf-$(CONFIG_X86_64) += bp-modify.o diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c index aae6ea0fe52b..147ad0638bbb 100644 --- a/tools/perf/arch/x86/tests/arch-tests.c +++ b/tools/perf/arch/x86/tests/arch-tests.c @@ -22,6 +22,15 @@ struct test_suite suite__intel_pt = { DEFINE_SUITE("x86 bp modify", bp_modify); #endif DEFINE_SUITE("x86 Sample parsing", x86_sample_parsing); +static struct test_case hybrid_tests[] = { + TEST_CASE_REASON("x86 hybrid event parsing", hybrid, "not hybrid"), + { .name = NULL, } +}; + +struct test_suite suite__hybrid = { + .desc = "x86 hybrid", + .test_cases = hybrid_tests, +}; struct test_suite *arch_tests[] = { #ifdef HAVE_DWARF_UNWIND_SUPPORT @@ -35,5 +44,6 @@ struct test_suite *arch_tests[] = { &suite__bp_modify, #endif &suite__x86_sample_parsing, + &suite__hybrid, NULL, }; diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c new file mode 100644 index 000000000000..0f99cfd116ee --- /dev/null +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "arch-tests.h" +#include "debug.h" +#include "evlist.h" +#include "evsel.h" +#include "pmu-hybrid.h" +#include "tests/tests.h" + +static bool test_config(const struct evsel *evsel, __u64 expected_config) +{ + return (evsel->core.attr.config & PERF_HW_EVENT_MASK) == expected_config; +} + +static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + return TEST_OK; +} + +static int test__hybrid_hw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + return TEST_OK; +} + +static int test__hybrid_sw_hw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + return TEST_OK; +} + +static int test__hybrid_hw_sw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + return TEST_OK; +} + +static int test__hybrid_group_modifier1(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + return TEST_OK; +} + +static int test__hybrid_raw1(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + if (!perf_pmu__hybrid_mounted("cpu_atom")) { + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + return TEST_OK; + } + + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + + /* The type of second event is randome value */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + return TEST_OK; +} + +static int test__hybrid_raw2(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + return TEST_OK; +} + +static int test__hybrid_cache_event(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x2 == (evsel->core.attr.config & 0xffffffff)); + return TEST_OK; +} + +static int test__checkevent_pmu(struct evlist *evlist) +{ + + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 10 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong config1", 1 == evsel->core.attr.config1); + TEST_ASSERT_VAL("wrong config2", 3 == evsel->core.attr.config2); + TEST_ASSERT_VAL("wrong config3", 0 == evsel->core.attr.config3); + /* + * The period value gets configured within evlist__config, + * while this test executes only parse events method. + */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->core.attr.sample_period); + + return TEST_OK; +} + +struct evlist_test { + const char *name; + bool (*valid)(void); + int (*check)(struct evlist *evlist); +}; + +static const struct evlist_test test__hybrid_events[] = { + { + .name = "cpu_core/cpu-cycles/", + .check = test__hybrid_hw_event_with_pmu, + /* 0 */ + }, + { + .name = "{cpu_core/cpu-cycles/,cpu_core/instructions/}", + .check = test__hybrid_hw_group_event, + /* 1 */ + }, + { + .name = "{cpu-clock,cpu_core/cpu-cycles/}", + .check = test__hybrid_sw_hw_group_event, + /* 2 */ + }, + { + .name = "{cpu_core/cpu-cycles/,cpu-clock}", + .check = test__hybrid_hw_sw_group_event, + /* 3 */ + }, + { + .name = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}", + .check = test__hybrid_group_modifier1, + /* 4 */ + }, + { + .name = "r1a", + .check = test__hybrid_raw1, + /* 5 */ + }, + { + .name = "cpu_core/r1a/", + .check = test__hybrid_raw2, + /* 6 */ + }, + { + .name = "cpu_core/config=10,config1,config2=3,period=1000/u", + .check = test__checkevent_pmu, + /* 7 */ + }, + { + .name = "cpu_core/LLC-loads/", + .check = test__hybrid_cache_event, + /* 8 */ + }, +}; + +static int test_event(const struct evlist_test *e) +{ + struct parse_events_error err; + struct evlist *evlist; + int ret; + + if (e->valid && !e->valid()) { + pr_debug("... SKIP\n"); + return TEST_OK; + } + + evlist = evlist__new(); + if (evlist == NULL) { + pr_err("Failed allocation"); + return TEST_FAIL; + } + parse_events_error__init(&err); + ret = parse_events(evlist, e->name, &err); + if (ret) { + pr_debug("failed to parse event '%s', err %d, str '%s'\n", + e->name, ret, err.str); + parse_events_error__print(&err, e->name); + ret = TEST_FAIL; + if (strstr(err.str, "can't access trace events")) + ret = TEST_SKIP; + } else { + ret = e->check(evlist); + } + parse_events_error__exit(&err); + evlist__delete(evlist); + + return ret; +} + +static int combine_test_results(int existing, int latest) +{ + if (existing == TEST_FAIL) + return TEST_FAIL; + if (existing == TEST_SKIP) + return latest == TEST_OK ? TEST_SKIP : latest; + return latest; +} + +static int test_events(const struct evlist_test *events, int cnt) +{ + int ret = TEST_OK; + + for (int i = 0; i < cnt; i++) { + const struct evlist_test *e = &events[i]; + int test_ret; + + pr_debug("running test %d '%s'\n", i, e->name); + test_ret = test_event(e); + if (test_ret != TEST_OK) { + pr_debug("Event test failure: test %d '%s'", i, e->name); + ret = combine_test_results(ret, test_ret); + } + } + + return ret; +} + +int test__hybrid(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +{ + if (!perf_pmu__has_hybrid()) + return TEST_SKIP; + + return test_events(test__hybrid_events, ARRAY_SIZE(test__hybrid_events)); +} diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 5ba90b32c524..43c0778983e5 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -6,7 +6,6 @@ #include "tests.h" #include "debug.h" #include "pmu.h" -#include "pmu-hybrid.h" #include "pmus.h" #include #include @@ -1523,127 +1522,6 @@ static int test__all_tracepoints(struct evlist *evlist) } #endif /* HAVE_LIBTRACEVENT */ -static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) -{ - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); - return TEST_OK; -} - -static int test__hybrid_hw_group_event(struct evlist *evlist) -{ - struct evsel *evsel, *leader; - - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - return TEST_OK; -} - -static int test__hybrid_sw_hw_group_event(struct evlist *evlist) -{ - struct evsel *evsel, *leader; - - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - return TEST_OK; -} - -static int test__hybrid_hw_sw_group_event(struct evlist *evlist) -{ - struct evsel *evsel, *leader; - - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - return TEST_OK; -} - -static int test__hybrid_group_modifier1(struct evlist *evlist) -{ - struct evsel *evsel, *leader; - - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - return TEST_OK; -} - -static int test__hybrid_raw1(struct evlist *evlist) -{ - struct evsel *evsel = evlist__first(evlist); - - if (!perf_pmu__hybrid_mounted("cpu_atom")) { - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); - return TEST_OK; - } - - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); - - /* The type of second event is randome value */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); - return TEST_OK; -} - -static int test__hybrid_raw2(struct evlist *evlist) -{ - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); - return TEST_OK; -} - -static int test__hybrid_cache_event(struct evlist *evlist) -{ - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", 0x2 == (evsel->core.attr.config & 0xffffffff)); - return TEST_OK; -} - struct evlist_test { const char *name; bool (*valid)(void); @@ -2011,54 +1889,6 @@ static const struct terms_test test__terms[] = { }, }; -static const struct evlist_test test__hybrid_events[] = { - { - .name = "cpu_core/cpu-cycles/", - .check = test__hybrid_hw_event_with_pmu, - /* 0 */ - }, - { - .name = "{cpu_core/cpu-cycles/,cpu_core/instructions/}", - .check = test__hybrid_hw_group_event, - /* 1 */ - }, - { - .name = "{cpu-clock,cpu_core/cpu-cycles/}", - .check = test__hybrid_sw_hw_group_event, - /* 2 */ - }, - { - .name = "{cpu_core/cpu-cycles/,cpu-clock}", - .check = test__hybrid_hw_sw_group_event, - /* 3 */ - }, - { - .name = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}", - .check = test__hybrid_group_modifier1, - /* 4 */ - }, - { - .name = "r1a", - .check = test__hybrid_raw1, - /* 5 */ - }, - { - .name = "cpu_core/r1a/", - .check = test__hybrid_raw2, - /* 6 */ - }, - { - .name = "cpu_core/config=10,config1,config2=3,period=1000/u", - .check = test__checkevent_pmu, - /* 7 */ - }, - { - .name = "cpu_core/LLC-loads/", - .check = test__hybrid_cache_event, - /* 8 */ - }, -}; - static int test_event(const struct evlist_test *e) { struct parse_events_error err; @@ -2337,14 +2167,6 @@ static bool test_alias(char **event, char **alias) return false; } -static int test__hybrid(struct test_suite *test __maybe_unused, int subtest __maybe_unused) -{ - if (!perf_pmu__has_hybrid()) - return TEST_SKIP; - - return test_events(test__hybrid_events, ARRAY_SIZE(test__hybrid_events)); -} - static int test__checkevent_pmu_events_alias(struct evlist *evlist) { struct evsel *evsel1 = evlist__first(evlist); @@ -2408,9 +2230,6 @@ static struct test_case tests__parse_events[] = { TEST_CASE_REASON("Test event parsing", events2, "permissions"), - TEST_CASE_REASON("Test parsing of \"hybrid\" CPU events", - hybrid, - "not hybrid"), TEST_CASE_REASON("Parsing of all PMU events from sysfs", pmu_events, "permissions"), -- cgit v1.2.3 From 8d8632887d74d06df5ef370427a8e2856e011546 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:28 -0700 Subject: perf test x86 hybrid: Update test expectations Don't assume evlist order. Switch to a loop rather than depend on evlist order for raw events test. Update hybrid event expectations. Previous values were based on parsing legacy hardware events from sysfs, update to the correct PMU specific legacy values. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 54 ++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index 0f99cfd116ee..d2a173ba3db9 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -11,13 +11,18 @@ static bool test_config(const struct evsel *evsel, __u64 expected_config) return (evsel->core.attr.config & PERF_HW_EVENT_MASK) == expected_config; } +static bool test_perf_config(const struct perf_evsel *evsel, __u64 expected_config) +{ + return (evsel->attr.config & PERF_HW_EVENT_MASK) == expected_config; +} + static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); return TEST_OK; } @@ -27,13 +32,13 @@ static int test__hybrid_hw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; } @@ -48,8 +53,8 @@ static int test__hybrid_sw_hw_group_event(struct evlist *evlist) TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; } @@ -60,8 +65,8 @@ static int test__hybrid_hw_sw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); @@ -76,15 +81,15 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x3c)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0xc0)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -93,22 +98,15 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) static int test__hybrid_raw1(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; - if (!perf_pmu__hybrid_mounted("cpu_atom")) { - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); - return TEST_OK; - } - - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + perf_evlist__for_each_evsel(&evlist->core, evsel) { + struct perf_pmu *pmu = perf_pmu__find_by_type(evsel->attr.type); - /* The type of second event is randome value */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + TEST_ASSERT_VAL("missing pmu", pmu); + TEST_ASSERT_VAL("unexpected pmu", !strncmp(pmu->name, "cpu_", 4)); + TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, 0x1a)); + } return TEST_OK; } -- cgit v1.2.3 From 68911aef3d76e74594b8f2dd018693c57d435355 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:29 -0700 Subject: perf test x86 hybrid: Add hybrid extended type checks Assert hybrid extended types are as expected. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index d2a173ba3db9..941a9edfed4e 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -16,12 +16,18 @@ static bool test_perf_config(const struct perf_evsel *evsel, __u64 expected_conf return (evsel->attr.config & PERF_HW_EVENT_MASK) == expected_config; } +static bool test_hybrid_type(const struct evsel *evsel, __u64 expected_config) +{ + return (evsel->core.attr.config >> PERF_PMU_TYPE_SHIFT) == expected_config; +} + static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); return TEST_OK; } @@ -33,11 +39,13 @@ static int test__hybrid_hw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; @@ -54,6 +62,7 @@ static int test__hybrid_sw_hw_group_event(struct evlist *evlist) evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; @@ -66,6 +75,7 @@ static int test__hybrid_hw_sw_group_event(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); @@ -82,6 +92,7 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) evsel = leader = evlist__first(evlist); TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); @@ -89,6 +100,7 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); -- cgit v1.2.3 From 6fd1e5191591f9d55afe4d23fa35af2a5cf8f81f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:30 -0700 Subject: perf parse-events: Support PMUs for legacy cache events Allow a legacy cache event to be both, for example, "L1-dcache-load-miss" and "cpu/L1-dcache-load-miss/" by introducing a new legacy cache term type. The term type is processed in config_term_pmu, setting both the type in perf_event_attr and the config. The code to determine the config is factored out of parse_events_add_cache and shared. If the PMU doesn't support legacy events, currently just core/hybrid PMUs do, then the term is treated like a PE_NAME term - as before. If only terms are being parsed, such as for perf_pmu__new_alias, then the PE_LEGACY_CACHE token is always parsed as PE_NAME. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 18 +++++++++++ tools/perf/util/parse-events.c | 70 ++++++++++++++++++++++++++++------------- tools/perf/util/parse-events.h | 3 ++ tools/perf/util/parse-events.l | 9 +++++- tools/perf/util/parse-events.y | 14 ++++++++- tools/perf/util/pmu.c | 5 +++ tools/perf/util/pmu.h | 1 + 7 files changed, 96 insertions(+), 24 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 43c0778983e5..c3afd0b129bb 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1875,6 +1875,24 @@ static const struct evlist_test test__events_pmu[] = { .check = test__checkevent_raw_pmu, /* 5 */ }, + { + .name = "cpu/L1-dcache-load-miss/", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_genhw, + /* 6 */ + }, + { + .name = "cpu/L1-dcache-load-miss/kp", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_genhw_modifier, + /* 7 */ + }, + { + .name = "cpu/L1-dcache-misses,name=cachepmu/", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_config_cache, + /* 8 */ + }, }; struct terms_test { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b5d95fce520c..f692dd953593 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -404,33 +404,27 @@ static int config_attr(struct perf_event_attr *attr, struct parse_events_error *err, config_term_func_t config_term); -int parse_events_add_cache(struct list_head *list, int *idx, const char *name, - struct parse_events_error *err, - struct list_head *head_config, - struct parse_events_state *parse_state) +/** + * parse_events__decode_legacy_cache - Search name for the legacy cache event + * name composed of 1, 2 or 3 hyphen + * separated sections. The first section is + * the cache type while the others are the + * optional op and optional result. To make + * life hard the names in the table also + * contain hyphens and the longest name + * should always be selected. + */ +static int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config) { - struct perf_event_attr attr; - LIST_HEAD(config_terms); - const char *config_name, *metric_id; - int cache_type = -1, cache_op = -1, cache_result = -1; - int ret, len; + int len, cache_type = -1, cache_op = -1, cache_result = -1; const char *name_end = &name[strlen(name) + 1]; - bool hybrid; const char *str = name; - /* - * Search str for the legacy cache event name composed of 1, 2 or 3 - * hyphen separated sections. The first section is the cache type while - * the others are the optional op and optional result. To make life hard - * the names in the table also contain hyphens and the longest name - * should always be selected. - */ cache_type = parse_aliases(str, evsel__hw_cache, PERF_COUNT_HW_CACHE_MAX, &len); if (cache_type == -1) return -EINVAL; str += len + 1; - config_name = get_config_name(head_config); if (str < name_end) { cache_op = parse_aliases(str, evsel__hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX, &len); @@ -471,9 +465,28 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, if (cache_result == -1) cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; + *config = ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT) | + cache_type | (cache_op << 8) | (cache_result << 16); + return 0; +} + +int parse_events_add_cache(struct list_head *list, int *idx, const char *name, + struct parse_events_error *err, + struct list_head *head_config, + struct parse_events_state *parse_state) +{ + struct perf_event_attr attr; + LIST_HEAD(config_terms); + const char *config_name, *metric_id; + int ret; + bool hybrid; + + memset(&attr, 0, sizeof(attr)); - attr.config = cache_type | (cache_op << 8) | (cache_result << 16); attr.type = PERF_TYPE_HW_CACHE; + ret = parse_events__decode_legacy_cache(name, /*pmu_type=*/0, &attr.config); + if (ret) + return ret; if (head_config) { if (config_attr(&attr, head_config, err, @@ -484,6 +497,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, return -ENOMEM; } + config_name = get_config_name(head_config); metric_id = get_config_metric_id(head_config); ret = parse_events__add_cache_hybrid(list, idx, &attr, config_name ? : name, @@ -1022,6 +1036,7 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE] = "aux-sample-size", [PARSE_EVENTS__TERM_TYPE_METRIC_ID] = "metric-id", [PARSE_EVENTS__TERM_TYPE_RAW] = "raw", + [PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE] = "legacy-cache", }; static bool config_term_shrinked; @@ -1199,15 +1214,25 @@ static int config_term_pmu(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err) { + if (term->type_term == PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE) { + const struct perf_pmu *pmu = perf_pmu__find_by_type(attr->type); + + if (perf_pmu__supports_legacy_cache(pmu)) { + attr->type = PERF_TYPE_HW_CACHE; + return parse_events__decode_legacy_cache(term->config, pmu->type, + &attr->config); + } else + term->type_term = PARSE_EVENTS__TERM_TYPE_USER; + } if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER || - term->type_term == PARSE_EVENTS__TERM_TYPE_DRV_CFG) + term->type_term == PARSE_EVENTS__TERM_TYPE_DRV_CFG) { /* * Always succeed for sysfs terms, as we dont know * at this point what type they need to have. */ return 0; - else - return config_term_common(attr, term, err); + } + return config_term_common(attr, term, err); } #ifdef HAVE_LIBTRACEEVENT @@ -2147,6 +2172,7 @@ int __parse_events(struct evlist *evlist, const char *str, .evlist = evlist, .stoken = PE_START_EVENTS, .fake_pmu = fake_pmu, + .match_legacy_cache_terms = true, }; int ret; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index f638542c8638..5acb62c2e00a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -71,6 +71,7 @@ enum { PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE, PARSE_EVENTS__TERM_TYPE_METRIC_ID, PARSE_EVENTS__TERM_TYPE_RAW, + PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE, __PARSE_EVENTS__TERM_TYPE_NR, }; @@ -122,6 +123,8 @@ struct parse_events_state { int stoken; struct perf_pmu *fake_pmu; char *hybrid_pmu_name; + /* Should PE_LEGACY_NAME tokens be generated for config terms? */ + bool match_legacy_cache_terms; bool wild_card_pmus; }; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 4b35c099189a..abe0ce681d29 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -63,6 +63,11 @@ static int str(yyscan_t scanner, int token) return token; } +static int lc_str(yyscan_t scanner, const struct parse_events_state *state) +{ + return str(scanner, state->match_legacy_cache_terms ? PE_LEGACY_CACHE : PE_NAME); +} + static bool isbpf_suffix(char *text) { int len = strlen(text); @@ -185,7 +190,6 @@ lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative %{ struct parse_events_state *_parse_state = parse_events_get_extra(yyscanner); - { int start_token = _parse_state->stoken; @@ -269,6 +273,9 @@ r{num_raw_hex} { return str(yyscanner, PE_RAW); } r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } +{lc_type} { return lc_str(yyscanner, _parse_state); } +{lc_type}-{lc_op_result} { return lc_str(yyscanner, _parse_state); } +{lc_type}-{lc_op_result}-{lc_op_result} { return lc_str(yyscanner, _parse_state); } {name_minus} { return str(yyscanner, PE_NAME); } \[all\] { return PE_ARRAY_ALL; } "[" { BEGIN(array); return '['; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index e7072b5601c5..f84fa1b132b3 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -723,7 +723,7 @@ event_term $$ = head; } -name_or_raw: PE_RAW | PE_NAME +name_or_raw: PE_RAW | PE_NAME | PE_LEGACY_CACHE event_term: PE_RAW @@ -775,6 +775,18 @@ name_or_raw '=' PE_VALUE_SYM_HW $$ = term; } | +PE_LEGACY_CACHE +{ + struct parse_events_term *term; + + if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE, + $1, 1, true, &@1, NULL)) { + free($1); + YYABORT; + } + $$ = term; +} +| PE_NAME { struct parse_events_term *term; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index cb33d869f1ed..63071d876190 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1650,6 +1650,11 @@ bool is_pmu_core(const char *name) return !strcmp(name, "cpu") || is_arm_pmu_core(name); } +bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) +{ + return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); +} + static bool pmu_alias_is_duplicate(struct sevent *alias_a, struct sevent *alias_b) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index b9a02dedd473..05702bc4bcf8 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -220,6 +220,7 @@ void perf_pmu__del_formats(struct list_head *formats); struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu); bool is_pmu_core(const char *name); +bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); void print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool pmu_have_event(const char *pname, const char *name); -- cgit v1.2.3 From 2bdf4d7ea9b66e54948297194d564a71504a5bda Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:31 -0700 Subject: perf parse-events: Wildcard legacy cache events It is inconsistent that "perf stat -e instructions-retired" wildcard opens on all PMUs while legacy cache events like "perf stat -e L1-dcache-load-miss" do not. A behavior introduced by hybrid is that a legacy cache event like L1-dcache-load-miss should wildcard open on all hybrid PMUs. Previously hybrid would call to is_event_supported for each PMU, a failure of which results in the event not being added. This isn't done in this case as the parser should just create perf_event_attr and the later open should fail, or the counter give "". If this wants to be avoided then the PMU can be named with the event. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-25-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-hybrid.c | 33 ----------------- tools/perf/util/parse-events-hybrid.h | 7 ---- tools/perf/util/parse-events.c | 68 ++++++++++++++++++----------------- tools/perf/util/parse-events.h | 3 +- tools/perf/util/parse-events.y | 2 +- 5 files changed, 37 insertions(+), 76 deletions(-) diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c index 7c9f9150bad5..d2c0be051d46 100644 --- a/tools/perf/util/parse-events-hybrid.c +++ b/tools/perf/util/parse-events-hybrid.c @@ -179,36 +179,3 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, return add_raw_hybrid(parse_state, list, attr, name, metric_id, config_terms); } - -int parse_events__add_cache_hybrid(struct list_head *list, int *idx, - struct perf_event_attr *attr, - const char *name, - const char *metric_id, - struct list_head *config_terms, - bool *hybrid, - struct parse_events_state *parse_state) -{ - struct perf_pmu *pmu; - int ret; - - *hybrid = false; - if (!perf_pmu__has_hybrid()) - return 0; - - *hybrid = true; - perf_pmu__for_each_hybrid_pmu(pmu) { - LIST_HEAD(terms); - - if (pmu_cmp(parse_state, pmu)) - continue; - - copy_config_terms(&terms, config_terms); - ret = create_event_hybrid(PERF_TYPE_HW_CACHE, idx, list, - attr, name, metric_id, &terms, pmu); - free_config_terms(&terms); - if (ret) - return ret; - } - - return 0; -} diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h index cbc05fec02a2..bc2966e73897 100644 --- a/tools/perf/util/parse-events-hybrid.h +++ b/tools/perf/util/parse-events-hybrid.h @@ -15,11 +15,4 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, struct list_head *config_terms, bool *hybrid); -int parse_events__add_cache_hybrid(struct list_head *list, int *idx, - struct perf_event_attr *attr, - const char *name, const char *metric_id, - struct list_head *config_terms, - bool *hybrid, - struct parse_events_state *parse_state); - #endif /* __PERF_PARSE_EVENTS_HYBRID_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f692dd953593..9f2bbf8f3a81 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -472,46 +472,48 @@ static int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_error *err, - struct list_head *head_config, - struct parse_events_state *parse_state) + struct list_head *head_config) { - struct perf_event_attr attr; - LIST_HEAD(config_terms); - const char *config_name, *metric_id; - int ret; - bool hybrid; + struct perf_pmu *pmu = NULL; + bool found_supported = false; + const char *config_name = get_config_name(head_config); + const char *metric_id = get_config_metric_id(head_config); + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + LIST_HEAD(config_terms); + struct perf_event_attr attr; + int ret; - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HW_CACHE; - ret = parse_events__decode_legacy_cache(name, /*pmu_type=*/0, &attr.config); - if (ret) - return ret; + /* Skip unsupported PMUs. */ + if (!perf_pmu__supports_legacy_cache(pmu)) + continue; - if (head_config) { - if (config_attr(&attr, head_config, err, - config_term_common)) - return -EINVAL; + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HW_CACHE; - if (get_config_terms(head_config, &config_terms)) - return -ENOMEM; - } + ret = parse_events__decode_legacy_cache(name, pmu->type, &attr.config); + if (ret) + return ret; - config_name = get_config_name(head_config); - metric_id = get_config_metric_id(head_config); - ret = parse_events__add_cache_hybrid(list, idx, &attr, - config_name ? : name, - metric_id, - &config_terms, - &hybrid, parse_state); - if (hybrid) - goto out_free_terms; + found_supported = true; - ret = add_event(list, idx, &attr, config_name ? : name, metric_id, - &config_terms); -out_free_terms: - free_config_terms(&config_terms); - return ret; + if (head_config) { + if (config_attr(&attr, head_config, err, + config_term_common)) + return -EINVAL; + + if (get_config_terms(head_config, &config_terms)) + return -ENOMEM; + } + + if (__add_event(list, idx, &attr, /*init_attr*/true, config_name ?: name, + metric_id, pmu, &config_terms, /*auto_merge_stats=*/false, + /*cpu_list=*/NULL) == NULL) + return -ENOMEM; + + free_config_terms(&config_terms); + } + return found_supported ? 0 : -EINVAL; } #ifdef HAVE_LIBTRACEEVENT diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 5acb62c2e00a..0c26303f7f63 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -172,8 +172,7 @@ int parse_events_add_tool(struct parse_events_state *parse_state, int tool_event); int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_error *error, - struct list_head *head_config, - struct parse_events_state *parse_state); + struct list_head *head_config); int parse_events_add_breakpoint(struct list_head *list, int *idx, u64 addr, char *type, u64 len); int parse_events_add_pmu(struct parse_events_state *parse_state, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index f84fa1b132b3..cc7528558845 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -476,7 +476,7 @@ PE_LEGACY_CACHE opt_event_config list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, error, $2, parse_state); + err = parse_events_add_cache(list, &parse_state->idx, $1, error, $2); parse_events_terms__delete($2); free($1); -- cgit v1.2.3 From d7f21df0c991f0909a992c0c7e2d31d4c46d40b4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:32 -0700 Subject: perf print-events: Print legacy cache events for each PMU Mirroring parse_events_add_cache, list the legacy name alongside its alias with the PMU. Remove the now unnecessary hybrid logic. Note, the alias output removes the event type descriptor, so: L1-dcache-loads [Hardware cache event] becomes: L1-dcache-loads OR cpu/L1-dcache-loads/ Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-26-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- tools/perf/util/parse-events.h | 1 + tools/perf/util/print-events.c | 88 ++++++++++++++++++++---------------------- 3 files changed, 43 insertions(+), 48 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9f2bbf8f3a81..ec72f11fb37f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -414,7 +414,7 @@ static int config_attr(struct perf_event_attr *attr, * contain hyphens and the longest name * should always be selected. */ -static int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config) +int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config) { int len, cache_type = -1, cache_op = -1, cache_result = -1; const char *name_end = &name[strlen(name) + 1]; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 0c26303f7f63..4e49be290209 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -173,6 +173,7 @@ int parse_events_add_tool(struct parse_events_state *parse_state, int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_error *error, struct list_head *head_config); +int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config); int parse_events_add_breakpoint(struct list_head *list, int *idx, u64 addr, char *type, u64 len); int parse_events_add_pmu(struct parse_events_state *parse_state, diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 89ac34a922c9..d148842b205a 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -230,56 +230,50 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) int print_hwcache_events(const struct print_callbacks *print_cb, void *print_state) { + struct perf_pmu *pmu = NULL; const char *event_type_descriptor = event_type_descriptors[PERF_TYPE_HW_CACHE]; - for (int type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { - for (int op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { - /* skip invalid cache type */ - if (!evsel__is_cache_op_valid(type, op)) - continue; - - for (int res = 0; res < PERF_COUNT_HW_CACHE_RESULT_MAX; res++) { - struct perf_pmu *pmu = NULL; - char name[64]; - - __evsel__hw_cache_type_op_res_name(type, op, res, - name, sizeof(name)); - if (!perf_pmu__has_hybrid()) { - if (is_event_supported(PERF_TYPE_HW_CACHE, - type | (op << 8) | (res << 16))) { - print_cb->print_event(print_state, - "cache", - /*pmu_name=*/NULL, - name, - /*event_alias=*/NULL, - /*scale_unit=*/NULL, - /*deprecated=*/false, - event_type_descriptor, - /*desc=*/NULL, - /*long_desc=*/NULL, - /*encoding_desc=*/NULL); - } + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + /* + * Skip uncore PMUs for performance. PERF_TYPE_HW_CACHE type + * attributes can accept software PMUs in the extended type, so + * also skip. + */ + if (pmu->is_uncore || pmu->type == PERF_TYPE_SOFTWARE) + continue; + + for (int type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) { + for (int op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) { + /* skip invalid cache type */ + if (!evsel__is_cache_op_valid(type, op)) continue; - } - perf_pmu__for_each_hybrid_pmu(pmu) { - if (is_event_supported(PERF_TYPE_HW_CACHE, - type | (op << 8) | (res << 16) | - ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT))) { - char new_name[128]; - snprintf(new_name, sizeof(new_name), - "%s/%s/", pmu->name, name); - print_cb->print_event(print_state, - "cache", - pmu->name, - name, - new_name, - /*scale_unit=*/NULL, - /*deprecated=*/false, - event_type_descriptor, - /*desc=*/NULL, - /*long_desc=*/NULL, - /*encoding_desc=*/NULL); - } + + for (int res = 0; res < PERF_COUNT_HW_CACHE_RESULT_MAX; res++) { + char name[64]; + char alias_name[128]; + __u64 config; + int ret; + + __evsel__hw_cache_type_op_res_name(type, op, res, + name, sizeof(name)); + + ret = parse_events__decode_legacy_cache(name, pmu->type, + &config); + if (ret || !is_event_supported(PERF_TYPE_HW_CACHE, config)) + continue; + snprintf(alias_name, sizeof(alias_name), "%s/%s/", + pmu->name, name); + print_cb->print_event(print_state, + "cache", + pmu->name, + name, + alias_name, + /*scale_unit=*/NULL, + /*deprecated=*/false, + event_type_descriptor, + /*desc=*/NULL, + /*long_desc=*/NULL, + /*encoding_desc=*/NULL); } } } -- cgit v1.2.3 From 8bc75f699c14142021d9ecbf5556ded13a403b64 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:33 -0700 Subject: perf parse-events: Support wildcards on raw events Legacy raw events like r1a open as PERF_TYPE_RAW on non-hybrid systems and on each hybrid PMU on hybrid systems. Rather than iterate hybrid PMUs add a perf_pmu__supports_wildcard_numeric function that says when a numeric event should be opened upon it. If the parsed event specifies the type of the PMU then don't wildcard match PMUs, use the specified PMU type. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-27-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 50 +++++++++++++++++++++++++++++------------- tools/perf/util/parse-events.h | 3 ++- tools/perf/util/parse-events.y | 13 +++++++---- tools/perf/util/pmu.c | 5 +++++ tools/perf/util/pmu.h | 1 + 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index ec72f11fb37f..c8b4ec076825 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -25,7 +25,6 @@ #include "util/parse-branch-options.h" #include "util/evsel_config.h" #include "util/event.h" -#include "util/parse-events-hybrid.h" #include "util/pmu-hybrid.h" #include "util/bpf-filter.h" #include "util/util.h" @@ -1448,15 +1447,14 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, #endif } -int parse_events_add_numeric(struct parse_events_state *parse_state, - struct list_head *list, - u32 type, u64 config, - struct list_head *head_config) +static int __parse_events_add_numeric(struct parse_events_state *parse_state, + struct list_head *list, + struct perf_pmu *pmu, u32 type, u64 config, + struct list_head *head_config) { struct perf_event_attr attr; LIST_HEAD(config_terms); const char *name, *metric_id; - bool hybrid; int ret; memset(&attr, 0, sizeof(attr)); @@ -1474,19 +1472,41 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, name = get_config_name(head_config); metric_id = get_config_metric_id(head_config); - ret = parse_events__add_numeric_hybrid(parse_state, list, &attr, - name, metric_id, - &config_terms, &hybrid); - if (hybrid) - goto out_free_terms; - - ret = add_event(list, &parse_state->idx, &attr, name, metric_id, - &config_terms); -out_free_terms: + ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name, + metric_id, pmu, &config_terms, /*auto_merge_stats=*/false, + /*cpu_list=*/NULL) ? 0 : -ENOMEM; free_config_terms(&config_terms); return ret; } +int parse_events_add_numeric(struct parse_events_state *parse_state, + struct list_head *list, + u32 type, u64 config, + struct list_head *head_config, + bool wildcard) +{ + struct perf_pmu *pmu = NULL; + bool found_supported = false; + + if (!wildcard) + return __parse_events_add_numeric(parse_state, list, /*pmu=*/NULL, + type, config, head_config); + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + int ret; + + if (!perf_pmu__supports_wildcard_numeric(pmu)) + continue; + + found_supported = true; + ret = __parse_events_add_numeric(parse_state, list, pmu, pmu->type, + config, head_config); + if (ret) + return ret; + } + return found_supported ? 0 : -EINVAL; +} + int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, int tool_event) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 4e49be290209..831cd1ff4702 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -166,7 +166,8 @@ int parse_events_load_bpf_obj(struct parse_events_state *parse_state, int parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, u32 type, u64 config, - struct list_head *head_config); + struct list_head *head_config, + bool wildcard); int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, int tool_event); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index cc7528558845..5055a29a448f 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -435,7 +435,8 @@ value_sym '/' event_config '/' list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_numeric(_parse_state, list, type, config, $3); + err = parse_events_add_numeric(_parse_state, list, type, config, $3, + /*wildcard=*/false); parse_events_terms__delete($3); if (err) { free_list_evsel(list); @@ -452,7 +453,9 @@ value_sym sep_slash_slash_dc list = alloc_list(); ABORT_ON(!list); - ABORT_ON(parse_events_add_numeric(_parse_state, list, type, config, NULL)); + ABORT_ON(parse_events_add_numeric(_parse_state, list, type, config, + /*head_config=*/NULL, + /*wildcard=*/false)); $$ = list; } | @@ -596,7 +599,8 @@ PE_VALUE ':' PE_VALUE opt_event_config list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_numeric(_parse_state, list, (u32)$1, $3, $4); + err = parse_events_add_numeric(_parse_state, list, (u32)$1, $3, $4, + /*wildcard=*/false); parse_events_terms__delete($4); if (err) { free(list); @@ -618,7 +622,8 @@ PE_RAW opt_event_config num = strtoull($1 + 1, NULL, 16); ABORT_ON(errno); free($1); - err = parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, num, $2); + err = parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, num, $2, + /*wildcard=*/true); parse_events_terms__delete($2); if (err) { free(list); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 63071d876190..cd4247a379d4 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1655,6 +1655,11 @@ bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); } +bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu) +{ + return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); +} + static bool pmu_alias_is_duplicate(struct sevent *alias_a, struct sevent *alias_b) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 05702bc4bcf8..5a19536a5449 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -221,6 +221,7 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); +bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); void print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool pmu_have_event(const char *pname, const char *name); -- cgit v1.2.3 From 996e54bbee825d25706796672926dbac826a0818 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:34 -0700 Subject: perf parse-events: Remove now unused hybrid logic The event parser no longer needs to recurse in case of a legacy cache event in a PMU, the necessary wild card logic has moved to perf_pmu__supports_legacy_cache and perf_pmu__supports_wildcard_numeric. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-28-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 - tools/perf/util/parse-events-hybrid.c | 181 ---------------------------------- tools/perf/util/parse-events-hybrid.h | 18 ---- tools/perf/util/parse-events.c | 74 -------------- tools/perf/util/parse-events.h | 8 -- 5 files changed, 282 deletions(-) delete mode 100644 tools/perf/util/parse-events-hybrid.c delete mode 100644 tools/perf/util/parse-events-hybrid.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index bd18fe5f2719..c146736ead19 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -24,7 +24,6 @@ perf-y += llvm-utils.o perf-y += mmap.o perf-y += memswap.o perf-y += parse-events.o -perf-y += parse-events-hybrid.o perf-y += print-events.o perf-y += tracepoint.o perf-y += perf_regs.o diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c deleted file mode 100644 index d2c0be051d46..000000000000 --- a/tools/perf/util/parse-events-hybrid.c +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include "evlist.h" -#include "evsel.h" -#include "parse-events.h" -#include "parse-events-hybrid.h" -#include "debug.h" -#include "pmu.h" -#include "pmu-hybrid.h" -#include "perf.h" - -static void config_hybrid_attr(struct perf_event_attr *attr, - int type, int pmu_type) -{ - /* - * attr.config layout for type PERF_TYPE_HARDWARE and - * PERF_TYPE_HW_CACHE - * - * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA - * AA: hardware event ID - * EEEEEEEE: PMU type ID - * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB - * BB: hardware cache ID - * CC: hardware cache op ID - * DD: hardware cache op result ID - * EEEEEEEE: PMU type ID - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. - */ - attr->type = type; - attr->config = (attr->config & PERF_HW_EVENT_MASK) | - ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT); -} - -static int create_event_hybrid(__u32 config_type, int *idx, - struct list_head *list, - struct perf_event_attr *attr, const char *name, - const char *metric_id, - struct list_head *config_terms, - struct perf_pmu *pmu) -{ - struct evsel *evsel; - __u32 type = attr->type; - __u64 config = attr->config; - - config_hybrid_attr(attr, config_type, pmu->type); - - /* - * Some hybrid hardware cache events are only available on one CPU - * PMU. For example, the 'L1-dcache-load-misses' is only available - * on cpu_core, while the 'L1-icache-loads' is only available on - * cpu_atom. We need to remove "not supported" hybrid cache events. - */ - if (attr->type == PERF_TYPE_HW_CACHE - && !is_event_supported(attr->type, attr->config)) - return 0; - - evsel = parse_events__add_event_hybrid(list, idx, attr, name, metric_id, - pmu, config_terms); - if (evsel) { - evsel->pmu_name = strdup(pmu->name); - if (!evsel->pmu_name) - return -ENOMEM; - } else - return -ENOMEM; - attr->type = type; - attr->config = config; - return 0; -} - -static int pmu_cmp(struct parse_events_state *parse_state, - struct perf_pmu *pmu) -{ - if (parse_state->evlist && parse_state->evlist->hybrid_pmu_name) - return strcmp(parse_state->evlist->hybrid_pmu_name, pmu->name); - - if (parse_state->hybrid_pmu_name) - return strcmp(parse_state->hybrid_pmu_name, pmu->name); - - return 0; -} - -static int add_hw_hybrid(struct parse_events_state *parse_state, - struct list_head *list, struct perf_event_attr *attr, - const char *name, const char *metric_id, - struct list_head *config_terms) -{ - struct perf_pmu *pmu; - int ret; - - perf_pmu__for_each_hybrid_pmu(pmu) { - LIST_HEAD(terms); - - if (pmu_cmp(parse_state, pmu)) - continue; - - copy_config_terms(&terms, config_terms); - ret = create_event_hybrid(PERF_TYPE_HARDWARE, - &parse_state->idx, list, attr, name, - metric_id, &terms, pmu); - free_config_terms(&terms); - if (ret) - return ret; - } - - return 0; -} - -static int create_raw_event_hybrid(int *idx, struct list_head *list, - struct perf_event_attr *attr, - const char *name, - const char *metric_id, - struct list_head *config_terms, - struct perf_pmu *pmu) -{ - struct evsel *evsel; - - attr->type = pmu->type; - evsel = parse_events__add_event_hybrid(list, idx, attr, name, metric_id, - pmu, config_terms); - if (evsel) - evsel->pmu_name = strdup(pmu->name); - else - return -ENOMEM; - - return 0; -} - -static int add_raw_hybrid(struct parse_events_state *parse_state, - struct list_head *list, struct perf_event_attr *attr, - const char *name, const char *metric_id, - struct list_head *config_terms) -{ - struct perf_pmu *pmu; - int ret; - - perf_pmu__for_each_hybrid_pmu(pmu) { - LIST_HEAD(terms); - - if (pmu_cmp(parse_state, pmu)) - continue; - - copy_config_terms(&terms, config_terms); - ret = create_raw_event_hybrid(&parse_state->idx, list, attr, - name, metric_id, &terms, pmu); - free_config_terms(&terms); - if (ret) - return ret; - } - - return 0; -} - -int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, - struct list_head *list, - struct perf_event_attr *attr, - const char *name, const char *metric_id, - struct list_head *config_terms, - bool *hybrid) -{ - *hybrid = false; - if (attr->type == PERF_TYPE_SOFTWARE) - return 0; - - if (!perf_pmu__has_hybrid()) - return 0; - - *hybrid = true; - if (attr->type != PERF_TYPE_RAW) { - return add_hw_hybrid(parse_state, list, attr, name, metric_id, - config_terms); - } - - return add_raw_hybrid(parse_state, list, attr, name, metric_id, - config_terms); -} diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h deleted file mode 100644 index bc2966e73897..000000000000 --- a/tools/perf/util/parse-events-hybrid.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PERF_PARSE_EVENTS_HYBRID_H -#define __PERF_PARSE_EVENTS_HYBRID_H - -#include -#include -#include -#include -#include - -int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, - struct list_head *list, - struct perf_event_attr *attr, - const char *name, const char *metric_id, - struct list_head *config_terms, - bool *hybrid); - -#endif /* __PERF_PARSE_EVENTS_HYBRID_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c8b4ec076825..1d8c3cf9c185 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -25,7 +25,6 @@ #include "util/parse-branch-options.h" #include "util/evsel_config.h" #include "util/event.h" -#include "util/pmu-hybrid.h" #include "util/bpf-filter.h" #include "util/util.h" #include "tracepoint.h" @@ -39,9 +38,6 @@ extern int parse_events_debug; int parse_events_parse(void *parse_state, void *scanner); static int get_config_terms(struct list_head *head_config, struct list_head *head_terms __maybe_unused); -static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, - const char *str, char *pmu_name, - struct list_head *list); struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = { @@ -1526,33 +1522,6 @@ static bool config_term_percore(struct list_head *config_terms) return false; } -static int parse_events__inside_hybrid_pmu(struct parse_events_state *parse_state, - struct list_head *list, char *name, - struct list_head *head_config) -{ - struct parse_events_term *term; - int ret = -1; - - if (parse_state->fake_pmu || !head_config || list_empty(head_config) || - !perf_pmu__is_hybrid(name)) { - return -1; - } - - /* - * More than one term in list. - */ - if (head_config->next && head_config->next->next != head_config) - return -1; - - term = list_first_entry(head_config, struct parse_events_term, list); - if (term && term->config && strcmp(term->config, "event")) { - ret = parse_events__with_hybrid_pmu(parse_state, term->config, - name, list); - } - - return ret; -} - int parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, char *name, struct list_head *head_config, @@ -1642,11 +1611,6 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (pmu->default_config && get_config_chgs(pmu, head_config, &config_terms)) return -ENOMEM; - if (!parse_events__inside_hybrid_pmu(parse_state, list, name, - head_config)) { - return 0; - } - if (!parse_state->fake_pmu && perf_pmu__config(pmu, &attr, head_config, parse_state->error)) { free_config_terms(&config_terms); return -EINVAL; @@ -2023,32 +1987,6 @@ int parse_events_terms(struct list_head *terms, const char *str) return ret; } -static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, - const char *str, char *pmu_name, - struct list_head *list) -{ - struct parse_events_state ps = { - .list = LIST_HEAD_INIT(ps.list), - .stoken = PE_START_EVENTS, - .hybrid_pmu_name = pmu_name, - .idx = parse_state->idx, - }; - int ret; - - ret = parse_events__scanner(str, &ps); - - if (!ret) { - if (!list_empty(&ps.list)) { - list_splice(&ps.list, list); - parse_state->idx = ps.idx; - return 0; - } else - return -1; - } - - return ret; -} - __weak int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) { /* Order by insertion index. */ @@ -2779,15 +2717,3 @@ char *parse_events_formats_error_string(char *additional_terms) fail: return NULL; } - -struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx, - struct perf_event_attr *attr, - const char *name, - const char *metric_id, - struct perf_pmu *pmu, - struct list_head *config_terms) -{ - return __add_event(list, idx, attr, /*init_attr=*/true, name, metric_id, - pmu, config_terms, /*auto_merge_stats=*/false, - /*cpu_list=*/NULL); -} diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 831cd1ff4702..77b8f7efdb94 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -122,7 +122,6 @@ struct parse_events_state { struct list_head *terms; int stoken; struct perf_pmu *fake_pmu; - char *hybrid_pmu_name; /* Should PE_LEGACY_NAME tokens be generated for config terms? */ bool match_legacy_cache_terms; bool wild_card_pmus; @@ -235,11 +234,4 @@ static inline bool is_sdt_event(char *str __maybe_unused) } #endif /* HAVE_LIBELF_SUPPORT */ -struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx, - struct perf_event_attr *attr, - const char *name, - const char *metric_id, - struct perf_pmu *pmu, - struct list_head *config_terms); - #endif /* __PERF_PARSE_EVENTS_H */ -- cgit v1.2.3 From 24d80818ce2216ee37308b3b087944efb2479b80 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:35 -0700 Subject: perf parse-events: Minor type safety cleanup Use the typed parse_state rather than void* _parse_state when available. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-29-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 5055a29a448f..e709508b1d6e 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -301,7 +301,7 @@ PE_NAME opt_pmu_config if (!list) CLEANUP_YYABORT; /* Attempt to add to list assuming $1 is a PMU name. */ - if (parse_events_add_pmu(_parse_state, list, $1, $2, /*auto_merge_stats=*/false)) { + if (parse_events_add_pmu(parse_state, list, $1, $2, /*auto_merge_stats=*/false)) { struct perf_pmu *pmu = NULL; int ok = 0; @@ -319,7 +319,7 @@ PE_NAME opt_pmu_config !perf_pmu__match(pattern, pmu->alias_name, $1)) { if (parse_events_copy_term_list(orig_terms, &terms)) CLEANUP_YYABORT; - if (!parse_events_add_pmu(_parse_state, list, pmu->name, terms, + if (!parse_events_add_pmu(parse_state, list, pmu->name, terms, /*auto_merge_stats=*/true)) { ok++; parse_state->wild_card_pmus = true; @@ -331,7 +331,7 @@ PE_NAME opt_pmu_config if (!ok) { /* Failure to add, assume $1 is an event name. */ zfree(&list); - ok = !parse_events_multi_pmu_add(_parse_state, $1, $2, &list); + ok = !parse_events_multi_pmu_add(parse_state, $1, $2, &list); $2 = NULL; } if (!ok) -- cgit v1.2.3 From 411ad22ecf0281d666a82aa7f4de90c70365da7d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:36 -0700 Subject: perf parse-events: Add pmu filter To support the cputype argument added to "perf stat" for hybrid it is necessary to filter events during wildcard matching. Add a scanner argument for the filter and checking it when wildcard matching. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-30-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 13 +++++++++-- tools/perf/builtin-stat.c | 10 +++++--- tools/perf/builtin-top.c | 5 +++- tools/perf/builtin-trace.c | 5 +++- tools/perf/tests/parse-events.c | 3 ++- tools/perf/tests/pmu-events.c | 3 ++- tools/perf/util/evlist.h | 1 - tools/perf/util/metricgroup.c | 4 ++-- tools/perf/util/parse-events.c | 51 ++++++++++++++++++++++++++++++----------- tools/perf/util/parse-events.h | 21 +++++++++++++---- tools/perf/util/parse-events.y | 6 +++-- 11 files changed, 90 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index efa03e4ac2c9..ec0f2d5f189f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3335,6 +3335,14 @@ const char record_callchain_help[] = CALLCHAIN_RECORD_HELP static bool dry_run; +static struct parse_events_option_args parse_events_option_args = { + .evlistp = &record.evlist, +}; + +static struct parse_events_option_args switch_output_parse_events_option_args = { + .evlistp = &record.sb_evlist, +}; + /* * XXX Will stay a global variable till we fix builtin-script.c to stop messing * with it and switch to use the library functions in perf_evlist that came @@ -3343,7 +3351,7 @@ static bool dry_run; * using pipes, etc. */ static struct option __record_options[] = { - OPT_CALLBACK('e', "event", &record.evlist, "event", + OPT_CALLBACK('e', "event", &parse_events_option_args, "event", "event selector. use 'perf list' to list available events", parse_events_option), OPT_CALLBACK(0, "filter", &record.evlist, "filter", @@ -3496,7 +3504,8 @@ static struct option __record_options[] = { &record.switch_output.set, "signal or size[BKMG] or time[smhd]", "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold", "signal"), - OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event", + OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args, + &record.switch_output_event_set, "switch output event", "switch output event selector. use 'perf list' to list available events", parse_events_option_new_evlist), OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files, diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b9ad32f21e57..de2e915427c9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -101,6 +101,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv); static struct evlist *evsel_list; +static struct parse_events_option_args parse_events_option_args = { + .evlistp = &evsel_list, +}; + static bool all_counters_use_bpf = true; static struct target target = { @@ -1096,8 +1100,8 @@ static int parse_hybrid_type(const struct option *opt, return -1; } - evlist->hybrid_pmu_name = perf_pmu__hybrid_type_to_pmu(str); - if (!evlist->hybrid_pmu_name) { + parse_events_option_args.pmu_filter = perf_pmu__hybrid_type_to_pmu(str); + if (!parse_events_option_args.pmu_filter) { fprintf(stderr, "--cputype %s is not supported!\n", str); return -1; } @@ -1108,7 +1112,7 @@ static int parse_hybrid_type(const struct option *opt, static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), - OPT_CALLBACK('e', "event", &evsel_list, "event", + OPT_CALLBACK('e', "event", &parse_events_option_args, "event", "event selector. use 'perf list' to list available events", parse_events_option), OPT_CALLBACK(0, "filter", &evsel_list, "filter", diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index eb5740154bc0..48ee49e95c5e 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1440,12 +1440,15 @@ int cmd_top(int argc, const char **argv) .max_stack = sysctl__max_stack(), .nr_threads_synthesize = UINT_MAX, }; + struct parse_events_option_args parse_events_option_args = { + .evlistp = &top.evlist, + }; bool branch_call_mode = false; struct record_opts *opts = &top.record_opts; struct target *target = &opts->target; const char *disassembler_style = NULL, *objdump_path = NULL, *addr2line_path = NULL; const struct option options[] = { - OPT_CALLBACK('e', "event", &top.evlist, "event", + OPT_CALLBACK('e', "event", &parse_events_option_args, "event", "event selector. use 'perf list' to list available events", parse_events_option), OPT_U64('c', "count", &opts->user_interval, "event period to sample"), diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8ee3a45c3c54..b49d3abb1203 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4591,8 +4591,11 @@ do_concat: err = 0; if (lists[0]) { + struct parse_events_option_args parse_events_option_args = { + .evlistp = &trace->evlist, + }; struct option o = { - .value = &trace->evlist, + .value = &parse_events_option_args, }; err = parse_events_option(&o, lists[0], 0); } diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index c3afd0b129bb..0d0c869d2d09 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1952,7 +1952,8 @@ static int test_event_fake_pmu(const char *str) return -ENOMEM; parse_events_error__init(&err); - ret = __parse_events(evlist, str, &err, &perf_pmu__fake, /*warn_if_reordered=*/true); + ret = __parse_events(evlist, str, /*pmu_filter=*/NULL, &err, + &perf_pmu__fake, /*warn_if_reordered=*/true); if (ret) { pr_debug("failed to parse event '%s', err %d, str '%s'\n", str, ret, err.str); diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index a2cde61b1c77..734004f1a37d 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -776,7 +776,8 @@ static int check_parse_id(const char *id, struct parse_events_error *error, for (cur = strchr(dup, '@') ; cur; cur = strchr(++cur, '@')) *cur = '/'; - ret = __parse_events(evlist, dup, error, fake_pmu, /*warn_if_reordered=*/true); + ret = __parse_events(evlist, dup, /*pmu_filter=*/NULL, error, fake_pmu, + /*warn_if_reordered=*/true); free(dup); evlist__delete(evlist); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 46cf402add93..e7e5540cc970 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -67,7 +67,6 @@ struct evlist { struct evsel *selected; struct events_stats stats; struct perf_env *env; - const char *hybrid_pmu_name; void (*trace_event_sample_raw)(struct evlist *evlist, union perf_event *event, struct perf_sample *sample); diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 4e7d41d285b4..908124dab122 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1441,8 +1441,8 @@ static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu, } pr_debug("Parsing metric events '%s'\n", events.buf); parse_events_error__init(&parse_error); - ret = __parse_events(parsed_evlist, events.buf, &parse_error, fake_pmu, - /*warn_if_reordered=*/false); + ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL, + &parse_error, fake_pmu, /*warn_if_reordered=*/false); if (ret) { parse_events_error__print(&parse_error, events.buf); goto err_out; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1d8c3cf9c185..d9d964bbc0e2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -465,8 +465,24 @@ int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *con return 0; } +/** + * parse_events__filter_pmu - returns false if a wildcard PMU should be + * considered, true if it should be filtered. + */ +bool parse_events__filter_pmu(const struct parse_events_state *parse_state, + const struct perf_pmu *pmu) +{ + if (parse_state->pmu_filter == NULL) + return false; + + if (pmu->name == NULL) + return true; + + return strcmp(parse_state->pmu_filter, pmu->name) != 0; +} + int parse_events_add_cache(struct list_head *list, int *idx, const char *name, - struct parse_events_error *err, + struct parse_events_state *parse_state, struct list_head *head_config) { struct perf_pmu *pmu = NULL; @@ -483,6 +499,9 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, if (!perf_pmu__supports_legacy_cache(pmu)) continue; + if (parse_events__filter_pmu(parse_state, pmu)) + continue; + memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_HW_CACHE; @@ -493,8 +512,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, found_supported = true; if (head_config) { - if (config_attr(&attr, head_config, err, - config_term_common)) + if (config_attr(&attr, head_config, parse_state->error, config_term_common)) return -EINVAL; if (get_config_terms(head_config, &config_terms)) @@ -1494,6 +1512,9 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, if (!perf_pmu__supports_wildcard_numeric(pmu)) continue; + if (parse_events__filter_pmu(parse_state, pmu)) + continue; + found_supported = true; ret = __parse_events_add_numeric(parse_state, list, pmu, pmu->type, config, head_config); @@ -1682,6 +1703,9 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, while ((pmu = perf_pmu__scan(pmu)) != NULL) { struct perf_pmu_alias *alias; + if (parse_events__filter_pmu(parse_state, pmu)) + continue; + list_for_each_entry(alias, &pmu->aliases, list) { if (!strcasecmp(alias->name, str)) { parse_events_copy_term_list(head, &orig_head); @@ -2121,7 +2145,7 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) return idx_changed || num_leaders != orig_num_leaders; } -int __parse_events(struct evlist *evlist, const char *str, +int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, struct parse_events_error *err, struct perf_pmu *fake_pmu, bool warn_if_reordered) { @@ -2132,6 +2156,7 @@ int __parse_events(struct evlist *evlist, const char *str, .evlist = evlist, .stoken = PE_START_EVENTS, .fake_pmu = fake_pmu, + .pmu_filter = pmu_filter, .match_legacy_cache_terms = true, }; int ret; @@ -2313,12 +2338,13 @@ void parse_events_error__print(struct parse_events_error *err, int parse_events_option(const struct option *opt, const char *str, int unset __maybe_unused) { - struct evlist *evlist = *(struct evlist **)opt->value; + struct parse_events_option_args *args = opt->value; struct parse_events_error err; int ret; parse_events_error__init(&err); - ret = parse_events(evlist, str, &err); + ret = __parse_events(*args->evlistp, str, args->pmu_filter, &err, + /*fake_pmu=*/NULL, /*warn_if_reordered=*/true); if (ret) { parse_events_error__print(&err, str); @@ -2331,22 +2357,21 @@ int parse_events_option(const struct option *opt, const char *str, int parse_events_option_new_evlist(const struct option *opt, const char *str, int unset) { - struct evlist **evlistp = opt->value; + struct parse_events_option_args *args = opt->value; int ret; - if (*evlistp == NULL) { - *evlistp = evlist__new(); + if (*args->evlistp == NULL) { + *args->evlistp = evlist__new(); - if (*evlistp == NULL) { + if (*args->evlistp == NULL) { fprintf(stderr, "Not enough memory to create evlist\n"); return -1; } } - ret = parse_events_option(opt, str, unset); if (ret) { - evlist__delete(*evlistp); - *evlistp = NULL; + evlist__delete(*args->evlistp); + *args->evlistp = NULL; } return ret; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 77b8f7efdb94..d4cbda6e946a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -22,17 +22,24 @@ bool is_event_supported(u8 type, u64 config); const char *event_type(int type); +/* Arguments encoded in opt->value. */ +struct parse_events_option_args { + struct evlist **evlistp; + const char *pmu_filter; +}; int parse_events_option(const struct option *opt, const char *str, int unset); int parse_events_option_new_evlist(const struct option *opt, const char *str, int unset); -__attribute__((nonnull(1, 2, 3))) -int __parse_events(struct evlist *evlist, const char *str, struct parse_events_error *error, - struct perf_pmu *fake_pmu, bool warn_if_reordered); +__attribute__((nonnull(1, 2, 4))) +int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, + struct parse_events_error *error, struct perf_pmu *fake_pmu, + bool warn_if_reordered); __attribute__((nonnull(1, 2, 3))) static inline int parse_events(struct evlist *evlist, const char *str, struct parse_events_error *err) { - return __parse_events(evlist, str, err, /*fake_pmu=*/NULL, /*warn_if_reordered=*/true); + return __parse_events(evlist, str, /*pmu_filter=*/NULL, err, /*fake_pmu=*/NULL, + /*warn_if_reordered=*/true); } int parse_event(struct evlist *evlist, const char *str); @@ -122,11 +129,15 @@ struct parse_events_state { struct list_head *terms; int stoken; struct perf_pmu *fake_pmu; + /* If non-null, when wildcard matching only match the given PMU. */ + const char *pmu_filter; /* Should PE_LEGACY_NAME tokens be generated for config terms? */ bool match_legacy_cache_terms; bool wild_card_pmus; }; +bool parse_events__filter_pmu(const struct parse_events_state *parse_state, + const struct perf_pmu *pmu); void parse_events__shrink_config_terms(void); int parse_events__is_hardcoded_term(struct parse_events_term *term); int parse_events_term__num(struct parse_events_term **term, @@ -171,7 +182,7 @@ int parse_events_add_tool(struct parse_events_state *parse_state, struct list_head *list, int tool_event); int parse_events_add_cache(struct list_head *list, int *idx, const char *name, - struct parse_events_error *error, + struct parse_events_state *parse_state, struct list_head *head_config); int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config); int parse_events_add_breakpoint(struct list_head *list, int *idx, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index e709508b1d6e..c95877cbd6cf 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -312,6 +312,9 @@ PE_NAME opt_pmu_config while ((pmu = perf_pmu__scan(pmu)) != NULL) { char *name = pmu->name; + if (parse_events__filter_pmu(parse_state, pmu)) + continue; + if (!strncmp(name, "uncore_", 7) && strncmp($1, "uncore_", 7)) name += 7; @@ -473,13 +476,12 @@ event_legacy_cache: PE_LEGACY_CACHE opt_event_config { struct parse_events_state *parse_state = _parse_state; - struct parse_events_error *error = parse_state->error; struct list_head *list; int err; list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, error, $2); + err = parse_events_add_cache(list, &parse_state->idx, $1, parse_state, $2); parse_events_terms__delete($2); free($1); -- cgit v1.2.3 From 003be8c4f71753092bbb86fa9d7ad26dd9fb98db Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:37 -0700 Subject: perf stat: Make cputype filter generic Rather than limit the --cputype argument for "perf list" and "perf stat" to hybrid PMUs of just cpu_atom and cpu_core, allow any PMU. Note, that if cpu_atom isn't mounted but a filter of cpu_atom is requested, then this will now fail. As such a filter would never succeed, no events can come from that unmounted PMU, then this behavior could never have been useful and failing is clearer. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-31-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 21 +++++++++++++-------- tools/perf/builtin-stat.c | 12 +++++++----- tools/perf/util/pmu-hybrid.c | 20 -------------------- tools/perf/util/pmu-hybrid.h | 1 - tools/perf/util/pmus.c | 25 ++++++++++++++++++++++++- tools/perf/util/pmus.h | 3 +++ 6 files changed, 47 insertions(+), 35 deletions(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 1f5dbd5f0ba4..c6bd0aa4a56e 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -11,8 +11,8 @@ #include "builtin.h" #include "util/print-events.h" +#include "util/pmus.h" #include "util/pmu.h" -#include "util/pmu-hybrid.h" #include "util/debug.h" #include "util/metricgroup.h" #include "util/string2.h" @@ -429,7 +429,7 @@ int cmd_list(int argc, const char **argv) .print_event = default_print_event, .print_metric = default_print_metric, }; - const char *hybrid_name = NULL; + const char *cputype = NULL; const char *unit_name = NULL; bool json = false; struct option list_options[] = { @@ -443,8 +443,8 @@ int cmd_list(int argc, const char **argv) "Print information on the perf event names and expressions used internally by events."), OPT_BOOLEAN(0, "deprecated", &default_ps.deprecated, "Print deprecated events."), - OPT_STRING(0, "cputype", &hybrid_name, "hybrid cpu type", - "Limit PMU or metric printing to the given hybrid PMU (e.g. core or atom)."), + OPT_STRING(0, "cputype", &cputype, "cpu type", + "Limit PMU or metric printing to the given PMU (e.g. cpu, core or atom)."), OPT_STRING(0, "unit", &unit_name, "PMU name", "Limit PMU or metric printing to the specified PMU."), OPT_INCR(0, "debug", &verbose, @@ -484,10 +484,15 @@ int cmd_list(int argc, const char **argv) assert(default_ps.visited_metrics); if (unit_name) default_ps.pmu_glob = strdup(unit_name); - else if (hybrid_name) { - default_ps.pmu_glob = perf_pmu__hybrid_type_to_pmu(hybrid_name); - if (!default_ps.pmu_glob) - pr_warning("WARNING: hybrid cputype is not supported!\n"); + else if (cputype) { + const struct perf_pmu *pmu = perf_pmus__pmu_for_pmu_filter(cputype); + + if (!pmu) { + pr_err("ERROR: cputype is not supported!\n"); + ret = -1; + goto out; + } + default_ps.pmu_glob = pmu->name; } } print_cb.print_start(ps); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index de2e915427c9..fe9c6fa3f14a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -44,6 +44,7 @@ #include "util/cgroup.h" #include #include "util/parse-events.h" +#include "util/pmus.h" #include "util/pmu.h" #include "util/event.h" #include "util/evlist.h" @@ -69,7 +70,6 @@ #include "util/pfm.h" #include "util/bpf_counter.h" #include "util/iostat.h" -#include "util/pmu-hybrid.h" #include "util/util.h" #include "asm/bug.h" @@ -1089,10 +1089,11 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } -static int parse_hybrid_type(const struct option *opt, +static int parse_cputype(const struct option *opt, const char *str, int unset __maybe_unused) { + const struct perf_pmu *pmu; struct evlist *evlist = *(struct evlist **)opt->value; if (!list_empty(&evlist->core.entries)) { @@ -1100,11 +1101,12 @@ static int parse_hybrid_type(const struct option *opt, return -1; } - parse_events_option_args.pmu_filter = perf_pmu__hybrid_type_to_pmu(str); - if (!parse_events_option_args.pmu_filter) { + pmu = perf_pmus__pmu_for_pmu_filter(str); + if (!pmu) { fprintf(stderr, "--cputype %s is not supported!\n", str); return -1; } + parse_events_option_args.pmu_filter = pmu->name; return 0; } @@ -1230,7 +1232,7 @@ static struct option stat_options[] = { OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type", "Only enable events on applying cpu with this type " "for hybrid platform (e.g. core or atom)", - parse_hybrid_type), + parse_cputype), #ifdef HAVE_LIBPFM OPT_CALLBACK(0, "pfm-events", &evsel_list, "event", "libpfm4 event selector. use 'perf list' to list available events", diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c index 38628805a952..bc4cb0738c35 100644 --- a/tools/perf/util/pmu-hybrid.c +++ b/tools/perf/util/pmu-hybrid.c @@ -50,23 +50,3 @@ bool perf_pmu__is_hybrid(const char *name) { return perf_pmu__find_hybrid_pmu(name) != NULL; } - -char *perf_pmu__hybrid_type_to_pmu(const char *type) -{ - char *pmu_name = NULL; - - if (asprintf(&pmu_name, "cpu_%s", type) < 0) - return NULL; - - if (perf_pmu__is_hybrid(pmu_name)) - return pmu_name; - - /* - * pmu may be not scanned, check the sysfs. - */ - if (perf_pmu__hybrid_mounted(pmu_name)) - return pmu_name; - - free(pmu_name); - return NULL; -} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index 2b186c26a43e..206b94931531 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -17,7 +17,6 @@ bool perf_pmu__hybrid_mounted(const char *name); struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); bool perf_pmu__is_hybrid(const char *name); -char *perf_pmu__hybrid_type_to_pmu(const char *type); static inline int perf_pmu__hybrid_pmu_num(void) { diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 7f3b93c4d229..140e11f00b29 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -1,5 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include +#include +#include "pmus.h" +#include "pmu.h" LIST_HEAD(pmus); + +const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (!strcmp(pmu->name, str)) + return pmu; + /* Ignore "uncore_" prefix. */ + if (!strncmp(pmu->name, "uncore_", 7)) { + if (!strcmp(pmu->name + 7, str)) + return pmu; + } + /* Ignore "cpu_" prefix on Intel hybrid PMUs. */ + if (!strncmp(pmu->name, "cpu_", 4)) { + if (!strcmp(pmu->name + 4, str)) + return pmu; + } + } + return NULL; +} diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 5ec12007eb5c..d475e2960c10 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -3,7 +3,10 @@ #define __PMUS_H extern struct list_head pmus; +struct perf_pmu; #define perf_pmus__for_each_pmu(pmu) list_for_each_entry(pmu, &pmus, list) +const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); + #endif /* __PMUS_H */ -- cgit v1.2.3 From aefde50a446b56f592a589e12e89935bea3b85f9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:39 -0700 Subject: perf test: Fix parse-events tests for >1 core PMU Remove assumptions of just 1 core PMU. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-33-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 177 ++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 72 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 0d0c869d2d09..71c77d9d2744 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -35,6 +35,11 @@ static bool test_config(const struct evsel *evsel, __u64 expected_config) return config == expected_config; } +static bool test_perf_config(const struct perf_evsel *evsel, __u64 expected_config) +{ + return (evsel->attr.config & PERF_HW_EVENT_MASK) == expected_config; +} + #ifdef HAVE_LIBTRACEEVENT #if defined(__s390x__) @@ -97,11 +102,27 @@ static int test__checkevent_tracepoint_multi(struct evlist *evlist) static int test__checkevent_raw(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; + bool raw_type_match = false; - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 0x1a)); + TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); + + perf_evlist__for_each_evsel(&evlist->core, evsel) { + struct perf_pmu *pmu; + bool type_matched = false; + + TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, 0x1a)); + perf_pmus__for_each_pmu(pmu) { + if (pmu->type == evsel->attr.type) { + TEST_ASSERT_VAL("PMU type expected once", !type_matched); + type_matched = true; + if (pmu->type == PERF_TYPE_RAW) + raw_type_match = true; + } + } + TEST_ASSERT_VAL("No PMU found for type", type_matched); + } + TEST_ASSERT_VAL("Raw PMU not matched", raw_type_match); return TEST_OK; } @@ -117,31 +138,35 @@ static int test__checkevent_numeric(struct evlist *evlist) static int test__checkevent_symbolic_name(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); + + perf_evlist__for_each_evsel(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", + test_perf_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + } return TEST_OK; } static int test__checkevent_symbolic_name_config(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - /* - * The period value gets configured within evlist__config, - * while this test executes only parse events method. - */ - TEST_ASSERT_VAL("wrong period", - 0 == evsel->core.attr.sample_period); - TEST_ASSERT_VAL("wrong config1", - 0 == evsel->core.attr.config1); - TEST_ASSERT_VAL("wrong config2", - 1 == evsel->core.attr.config2); + TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); + + perf_evlist__for_each_evsel(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + /* + * The period value gets configured within evlist__config, + * while this test executes only parse events method. + */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->attr.sample_period); + TEST_ASSERT_VAL("wrong config1", 0 == evsel->attr.config1); + TEST_ASSERT_VAL("wrong config2", 1 == evsel->attr.config2); + } return TEST_OK; } @@ -157,11 +182,14 @@ static int test__checkevent_symbolic_alias(struct evlist *evlist) static int test__checkevent_genhw(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; - TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 1 << 16)); + TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); + + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, 1 << 16)); + } return TEST_OK; } @@ -253,17 +281,15 @@ static int test__checkevent_tracepoint_modifier(struct evlist *evlist) static int test__checkevent_tracepoint_multi_modifier(struct evlist *evlist) { - struct evsel *evsel; + struct perf_evsel *evsel; TEST_ASSERT_VAL("wrong number of entries", evlist->core.nr_entries > 1); - evlist__for_each_entry(evlist, evsel) { - TEST_ASSERT_VAL("wrong exclude_user", - !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", - evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); } return test__checkevent_tracepoint_multi(evlist); @@ -272,25 +298,27 @@ test__checkevent_tracepoint_multi_modifier(struct evlist *evlist) static int test__checkevent_raw_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + struct perf_evsel *evsel; + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + } return test__checkevent_raw(evlist); } static int test__checkevent_numeric_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + struct perf_evsel *evsel; + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + } return test__checkevent_numeric(evlist); } @@ -308,21 +336,23 @@ static int test__checkevent_symbolic_name_modifier(struct evlist *evlist) static int test__checkevent_exclude_host_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + struct perf_evsel *evsel; + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); + } return test__checkevent_symbolic_name(evlist); } static int test__checkevent_exclude_guest_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + struct perf_evsel *evsel; + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); + } return test__checkevent_symbolic_name(evlist); } @@ -340,13 +370,14 @@ static int test__checkevent_symbolic_alias_modifier(struct evlist *evlist) static int test__checkevent_genhw_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + struct perf_evsel *evsel; + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + } return test__checkevent_genhw(evlist); } @@ -476,21 +507,23 @@ static int test__checkevent_list(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", 3 <= evlist->core.nr_entries); /* r1 */ - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); - TEST_ASSERT_VAL("wrong config1", 0 == evsel->core.attr.config1); - TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2); - TEST_ASSERT_VAL("wrong config3", 0 == evsel->core.attr.config3); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT != evsel->core.attr.type); + while (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) { + TEST_ASSERT_VAL("wrong config", test_config(evsel, 1)); + TEST_ASSERT_VAL("wrong config1", 0 == evsel->core.attr.config1); + TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2); + TEST_ASSERT_VAL("wrong config3", 0 == evsel->core.attr.config3); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + evsel = evsel__next(evsel); + } /* syscalls:sys_enter_openat:k */ - evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->core.attr.type); TEST_ASSERT_VAL("wrong sample_type", PERF_TP_SAMPLE_TYPE == evsel->core.attr.sample_type); @@ -1930,7 +1963,7 @@ static int test_event(const struct evlist_test *e) e->name, ret, err.str); parse_events_error__print(&err, e->name); ret = TEST_FAIL; - if (strstr(err.str, "can't access trace events")) + if (err.str && strstr(err.str, "can't access trace events")) ret = TEST_SKIP; } else { ret = e->check(evlist); -- cgit v1.2.3 From 5ea8f2ccffb23983f02012a2731464586b10fbf3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:40 -0700 Subject: perf parse-events: Support hardware events as terms An event like "cpu/instructions/" typically parses due to there being a sysfs event called instructions. On hybrid recursive parsing means that the hardware event is encoded in the attribute, with the PMU being placed in the high bits of the config: ''' $ perf stat -vv -e 'cpu_core/cycles/' true ... ------------------------------------------------------------ perf_event_attr: size 136 config 0x400000000 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ------------------------------------------------------------ ''' Make this behavior the default by adding a new term type and token for hardware events. The token gathers both the numeric config and the parsed name, so that if the token appears like "cycles/name=cycles/" then the token can be handled like a name. The numeric value isn't sufficient to distinguish say "cpu-cycles" from "cycles". Extend the parse-events test so that all current non-PMU hardware parsing tests, also test with the PMU cpu - more than half the change. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-34-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 126 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 37 ++++-------- tools/perf/util/parse-events.h | 3 +- tools/perf/util/parse-events.l | 20 +++++++ tools/perf/util/parse-events.y | 34 +++++++++-- 5 files changed, 187 insertions(+), 33 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 71c77d9d2744..9ca8e19bda00 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1926,6 +1926,132 @@ static const struct evlist_test test__events_pmu[] = { .check = test__checkevent_config_cache, /* 8 */ }, + { + .name = "cpu/instructions/", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_symbolic_name, + /* 9 */ + }, + { + .name = "cpu/cycles,period=100000,config2/", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_symbolic_name_config, + /* 0 */ + }, + { + .name = "cpu/instructions/h", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_symbolic_name_modifier, + /* 1 */ + }, + { + .name = "cpu/instructions/G", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_exclude_host_modifier, + /* 2 */ + }, + { + .name = "cpu/instructions/H", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_exclude_guest_modifier, + /* 3 */ + }, + { + .name = "{cpu/instructions/k,cpu/cycles/upp}", + .valid = test__pmu_cpu_valid, + .check = test__group1, + /* 4 */ + }, + { + .name = "{cpu/cycles/u,cpu/instructions/kp}:p", + .valid = test__pmu_cpu_valid, + .check = test__group4, + /* 5 */ + }, + { + .name = "{cpu/cycles/,cpu/cache-misses/G}:H", + .valid = test__pmu_cpu_valid, + .check = test__group_gh1, + /* 6 */ + }, + { + .name = "{cpu/cycles/,cpu/cache-misses/H}:G", + .valid = test__pmu_cpu_valid, + .check = test__group_gh2, + /* 7 */ + }, + { + .name = "{cpu/cycles/G,cpu/cache-misses/H}:u", + .valid = test__pmu_cpu_valid, + .check = test__group_gh3, + /* 8 */ + }, + { + .name = "{cpu/cycles/G,cpu/cache-misses/H}:uG", + .valid = test__pmu_cpu_valid, + .check = test__group_gh4, + /* 9 */ + }, + { + .name = "{cpu/cycles/,cpu/cache-misses/,cpu/branch-misses/}:S", + .valid = test__pmu_cpu_valid, + .check = test__leader_sample1, + /* 0 */ + }, + { + .name = "{cpu/instructions/,cpu/branch-misses/}:Su", + .valid = test__pmu_cpu_valid, + .check = test__leader_sample2, + /* 1 */ + }, + { + .name = "cpu/instructions/uDp", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_pinned_modifier, + /* 2 */ + }, + { + .name = "{cpu/cycles/,cpu/cache-misses/,cpu/branch-misses/}:D", + .valid = test__pmu_cpu_valid, + .check = test__pinned_group, + /* 3 */ + }, + { + .name = "cpu/instructions/I", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_exclude_idle_modifier, + /* 4 */ + }, + { + .name = "cpu/instructions/kIG", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_exclude_idle_modifier_1, + /* 5 */ + }, + { + .name = "cpu/cycles/u", + .valid = test__pmu_cpu_valid, + .check = test__sym_event_slash, + /* 6 */ + }, + { + .name = "cpu/cycles/k", + .valid = test__pmu_cpu_valid, + .check = test__sym_event_dc, + /* 7 */ + }, + { + .name = "cpu/instructions/uep", + .valid = test__pmu_cpu_valid, + .check = test__checkevent_exclusive_modifier, + /* 8 */ + }, + { + .name = "{cpu/cycles/,cpu/cache-misses/,cpu/branch-misses/}:e", + .valid = test__pmu_cpu_valid, + .check = test__exclusive_group, + /* 9 */ + }, }; struct terms_test { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d9d964bbc0e2..dea27bc0b376 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1052,6 +1052,7 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_METRIC_ID] = "metric-id", [PARSE_EVENTS__TERM_TYPE_RAW] = "raw", [PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE] = "legacy-cache", + [PARSE_EVENTS__TERM_TYPE_HARDWARE] = "hardware", }; static bool config_term_shrinked; @@ -1239,6 +1240,17 @@ static int config_term_pmu(struct perf_event_attr *attr, } else term->type_term = PARSE_EVENTS__TERM_TYPE_USER; } + if (term->type_term == PARSE_EVENTS__TERM_TYPE_HARDWARE) { + const struct perf_pmu *pmu = perf_pmu__find_by_type(attr->type); + + if (!pmu) { + pr_debug("Failed to find PMU for type %d", attr->type); + return -EINVAL; + } + attr->type = PERF_TYPE_HARDWARE; + attr->config = ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT) | term->val.num; + return 0; + } if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER || term->type_term == PARSE_EVENTS__TERM_TYPE_DRV_CFG) { /* @@ -2569,31 +2581,6 @@ int parse_events_term__str(struct parse_events_term **term, return new_term(term, &temp, str, 0); } -int parse_events_term__sym_hw(struct parse_events_term **term, - char *config, unsigned idx) -{ - struct event_symbol *sym; - char *str; - struct parse_events_term temp = { - .type_val = PARSE_EVENTS__TERM_TYPE_STR, - .type_term = PARSE_EVENTS__TERM_TYPE_USER, - .config = config, - }; - - if (!temp.config) { - temp.config = strdup("event"); - if (!temp.config) - return -ENOMEM; - } - BUG_ON(idx >= PERF_COUNT_HW_MAX); - sym = &event_symbols_hw[idx]; - - str = strdup(sym->symbol); - if (!str) - return -ENOMEM; - return new_term(term, &temp, str, 0); -} - int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index d4cbda6e946a..7fe80b416143 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -79,6 +79,7 @@ enum { PARSE_EVENTS__TERM_TYPE_METRIC_ID, PARSE_EVENTS__TERM_TYPE_RAW, PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE, + PARSE_EVENTS__TERM_TYPE_HARDWARE, __PARSE_EVENTS__TERM_TYPE_NR, }; @@ -147,8 +148,6 @@ int parse_events_term__num(struct parse_events_term **term, int parse_events_term__str(struct parse_events_term **term, int type_term, char *config, char *str, void *loc_term, void *loc_val); -int parse_events_term__sym_hw(struct parse_events_term **term, - char *config, unsigned idx); int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term); void parse_events_term__delete(struct parse_events_term *term); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index abe0ce681d29..6deb70c25984 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -149,6 +149,16 @@ static int term(yyscan_t scanner, int type) return PE_TERM; } +static int hw_term(yyscan_t scanner, int config) +{ + YYSTYPE *yylval = parse_events_get_lval(scanner); + char *text = parse_events_get_text(scanner); + + yylval->hardware_term.str = strdup(text); + yylval->hardware_term.num = PERF_TYPE_HARDWARE + config; + return PE_TERM_HW; +} + #define YY_USER_ACTION \ do { \ yylloc->last_column = yylloc->first_column; \ @@ -269,6 +279,16 @@ percore { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_PERCORE); } aux-output { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } metric-id { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } +cpu-cycles|cycles { return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } +stalled-cycles-frontend|idle-cycles-frontend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } +stalled-cycles-backend|idle-cycles-backend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } +instructions { return hw_term(yyscanner, PERF_COUNT_HW_INSTRUCTIONS); } +cache-references { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_REFERENCES); } +cache-misses { return hw_term(yyscanner, PERF_COUNT_HW_CACHE_MISSES); } +branch-instructions|branches { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +branch-misses { return hw_term(yyscanner, PERF_COUNT_HW_BRANCH_MISSES); } +bus-cycles { return hw_term(yyscanner, PERF_COUNT_HW_BUS_CYCLES); } +ref-cycles { return hw_term(yyscanner, PERF_COUNT_HW_REF_CPU_CYCLES); } r{num_raw_hex} { return str(yyscanner, PE_RAW); } r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } , { return ','; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index c95877cbd6cf..819a5123fd77 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -65,6 +65,7 @@ static void free_list_evsel(struct list_head* list_evsel) %token PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %token PE_ARRAY_ALL PE_ARRAY_RANGE %token PE_DRV_CFG_TERM +%token PE_TERM_HW %type PE_VALUE %type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW @@ -112,6 +113,8 @@ static void free_list_evsel(struct list_head* list_evsel) %type array_term %type array_terms %destructor { free ($$.ranges); } +%type PE_TERM_HW +%destructor { free ($$.str); } %union { @@ -125,6 +128,10 @@ static void free_list_evsel(struct list_head* list_evsel) char *event; } tracepoint_name; struct parse_events_array array; + struct hardware_term { + char *str; + u64 num; + } hardware_term; } %% @@ -770,13 +777,14 @@ name_or_raw '=' PE_VALUE $$ = term; } | -name_or_raw '=' PE_VALUE_SYM_HW +name_or_raw '=' PE_TERM_HW { struct parse_events_term *term; - int config = $3 & 255; - if (parse_events_term__sym_hw(&term, $1, config)) { + if (parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, + $1, $3.str, &@1, &@3)) { free($1); + free($3.str); YYABORT; } $$ = term; @@ -806,12 +814,15 @@ PE_NAME $$ = term; } | -PE_VALUE_SYM_HW +PE_TERM_HW { struct parse_events_term *term; - int config = $1 & 255; - ABORT_ON(parse_events_term__sym_hw(&term, NULL, config)); + if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_HARDWARE, + $1.str, $1.num & 255, false, &@1, NULL)) { + free($1.str); + YYABORT; + } $$ = term; } | @@ -826,6 +837,17 @@ PE_TERM '=' PE_NAME $$ = term; } | +PE_TERM '=' PE_TERM_HW +{ + struct parse_events_term *term; + + if (parse_events_term__str(&term, (int)$1, NULL, $3.str, &@1, &@3)) { + free($3.str); + YYABORT; + } + $$ = term; +} +| PE_TERM '=' PE_VALUE { struct parse_events_term *term; -- cgit v1.2.3 From e831f3ccf9920fa099d4ebb9d57214cc7ecd2e70 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:41 -0700 Subject: perf parse-events: Avoid error when assigning a term Avoid the parser error: ''' $ perf stat -e 'cycles/name=name/' true event syntax error: 'cycles/name=name/' \___ parser error ''' by turning the term back to a string if it is on the right. Add PMU and generic parsing tests. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-35-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 21 +++++++++++++++++++++ tools/perf/util/parse-events.c | 9 +++++++++ tools/perf/util/parse-events.h | 3 +++ tools/perf/util/parse-events.y | 8 ++++++++ 4 files changed, 41 insertions(+) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 9ca8e19bda00..eb893cc15878 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1499,6 +1499,16 @@ static int test__sym_event_dc(struct evlist *evlist) return TEST_OK; } +static int test__term_equal_term(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "name") == 0); + return TEST_OK; +} + #ifdef HAVE_LIBTRACEEVENT static int count_tracepoints(void) { @@ -1871,6 +1881,11 @@ static const struct evlist_test test__events[] = { .check = test__exclusive_group, /* 7 */ }, + { + .name = "cycles/name=name/", + .check = test__term_equal_term, + /* 8 */ + }, }; static const struct evlist_test test__events_pmu[] = { @@ -2052,6 +2067,12 @@ static const struct evlist_test test__events_pmu[] = { .check = test__exclusive_group, /* 9 */ }, + { + .name = "cpu/cycles,name=name/", + .valid = test__pmu_cpu_valid, + .check = test__term_equal_term, + /* 0 */ + }, }; struct terms_test { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index dea27bc0b376..5d5d77fa398b 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2581,6 +2581,15 @@ int parse_events_term__str(struct parse_events_term **term, return new_term(term, &temp, str, 0); } +int parse_events_term__term(struct parse_events_term **term, + int term_lhs, int term_rhs, + void *loc_term, void *loc_val) +{ + return parse_events_term__str(term, term_lhs, NULL, + strdup(config_term_names[term_rhs]), + loc_term, loc_val); +} + int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 7fe80b416143..2a8cafe0ee8f 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -148,6 +148,9 @@ int parse_events_term__num(struct parse_events_term **term, int parse_events_term__str(struct parse_events_term **term, int type_term, char *config, char *str, void *loc_term, void *loc_val); +int parse_events_term__term(struct parse_events_term **term, + int term_lhs, int term_rhs, + void *loc_term, void *loc_val); int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term); void parse_events_term__delete(struct parse_events_term *term); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 819a5123fd77..0aaebc57748e 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -848,6 +848,14 @@ PE_TERM '=' PE_TERM_HW $$ = term; } | +PE_TERM '=' PE_TERM +{ + struct parse_events_term *term; + + ABORT_ON(parse_events_term__term(&term, (int)$1, (int)$3, &@1, &@3)); + $$ = term; +} +| PE_TERM '=' PE_VALUE { struct parse_events_term *term; -- cgit v1.2.3 From 2aadca4b35427a7c65acc6aa415b38758128b22c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:42 -0700 Subject: perf parse-events: Avoid error when assigning a legacy cache term Avoid the parser error: ''' $ perf stat -e 'cycles/name=l1d/' true event syntax error: 'cycles/name=l1d/' \___ parser error ''' by combining the name and legacy cache cases in the parser. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-36-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 21 +++++++++++++++++++++ tools/perf/util/parse-events.y | 10 ++++++---- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index eb893cc15878..72a10bed84fd 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1509,6 +1509,16 @@ static int test__term_equal_term(struct evlist *evlist) return TEST_OK; } +static int test__term_equal_legacy(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "l1d") == 0); + return TEST_OK; +} + #ifdef HAVE_LIBTRACEEVENT static int count_tracepoints(void) { @@ -1886,6 +1896,11 @@ static const struct evlist_test test__events[] = { .check = test__term_equal_term, /* 8 */ }, + { + .name = "cycles/name=l1d/", + .check = test__term_equal_legacy, + /* 9 */ + }, }; static const struct evlist_test test__events_pmu[] = { @@ -2073,6 +2088,12 @@ static const struct evlist_test test__events_pmu[] = { .check = test__term_equal_term, /* 0 */ }, + { + .name = "cpu/cycles,name=l1d/", + .valid = test__pmu_cpu_valid, + .check = test__term_equal_legacy, + /* 1 */ + }, }; struct terms_test { diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 0aaebc57748e..f4ee03b5976b 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -82,7 +82,7 @@ static void free_list_evsel(struct list_head* list_evsel) %type PE_EVENT_NAME %type PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM -%type name_or_raw +%type name_or_raw name_or_legacy %destructor { free ($$); } %type event_term %destructor { parse_events_term__delete ($$); } @@ -739,6 +739,8 @@ event_term name_or_raw: PE_RAW | PE_NAME | PE_LEGACY_CACHE +name_or_legacy: PE_NAME | PE_LEGACY_CACHE + event_term: PE_RAW { @@ -752,7 +754,7 @@ PE_RAW $$ = term; } | -name_or_raw '=' PE_NAME +name_or_raw '=' name_or_legacy { struct parse_events_term *term; @@ -826,7 +828,7 @@ PE_TERM_HW $$ = term; } | -PE_TERM '=' PE_NAME +PE_TERM '=' name_or_legacy { struct parse_events_term *term; @@ -872,7 +874,7 @@ PE_TERM $$ = term; } | -name_or_raw array '=' PE_NAME +name_or_raw array '=' name_or_legacy { struct parse_events_term *term; -- cgit v1.2.3 From 52c7b4d3f9c12c44b8392765c73cced3be99cec6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:43 -0700 Subject: perf parse-events: Don't auto merge hybrid wildcard events Bring back the behavior of not auto-merging hybrid events by delegating to a test in pmu. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-37-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 5 ++++- tools/perf/util/parse-events.y | 4 +++- tools/perf/util/pmu.c | 5 +++++ tools/perf/util/pmu.h | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 5d5d77fa398b..2dad88a6bf19 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1714,16 +1714,19 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, while ((pmu = perf_pmu__scan(pmu)) != NULL) { struct perf_pmu_alias *alias; + bool auto_merge_stats; if (parse_events__filter_pmu(parse_state, pmu)) continue; + auto_merge_stats = perf_pmu__auto_merge_stats(pmu); + list_for_each_entry(alias, &pmu->aliases, list) { if (!strcasecmp(alias->name, str)) { parse_events_copy_term_list(head, &orig_head); if (!parse_events_add_pmu(parse_state, list, pmu->name, orig_head, - /*auto_merge_stats=*/true)) { + auto_merge_stats)) { pr_debug("%s -> %s/%s/\n", str, pmu->name, alias->str); ok++; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index f4ee03b5976b..4e1f5de35be8 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -327,10 +327,12 @@ PE_NAME opt_pmu_config name += 7; if (!perf_pmu__match(pattern, name, $1) || !perf_pmu__match(pattern, pmu->alias_name, $1)) { + bool auto_merge_stats = perf_pmu__auto_merge_stats(pmu); + if (parse_events_copy_term_list(orig_terms, &terms)) CLEANUP_YYABORT; if (!parse_events_add_pmu(parse_state, list, pmu->name, terms, - /*auto_merge_stats=*/true)) { + auto_merge_stats)) { ok++; parse_state->wild_card_pmus = true; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index cd4247a379d4..f4f0afbc391c 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1660,6 +1660,11 @@ bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu) return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); } +bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) +{ + return !perf_pmu__is_hybrid(pmu->name); +} + static bool pmu_alias_is_duplicate(struct sevent *alias_a, struct sevent *alias_b) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 5a19536a5449..0e0cb6283594 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -222,6 +222,7 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); +bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); void print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool pmu_have_event(const char *pname, const char *name); -- cgit v1.2.3 From 5136e43c6139d4d400effb33643eb171dc4c3bfa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:44 -0700 Subject: perf parse-events: Don't reorder atom cpu events On hybrid systems the topdown events don't share a fixed counter on the atom core, so they don't require the sorting the perf metric supporting PMUs do. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-38-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/evlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index d4193479a364..1b6065841fb0 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -6,6 +6,7 @@ #include "util/event.h" #include "util/pmu-hybrid.h" #include "topdown.h" +#include "evsel.h" static int ___evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *attrs, @@ -67,8 +68,7 @@ int arch_evlist__add_default_attrs(struct evlist *evlist, int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) { - if (topdown_sys_has_perf_metrics() && - (!lhs->pmu_name || !strncmp(lhs->pmu_name, "cpu", 3))) { + if (topdown_sys_has_perf_metrics() && evsel__sys_has_perf_metrics(lhs)) { /* Ensure the topdown slots comes first. */ if (strcasestr(lhs->name, "slots")) return -1; -- cgit v1.2.3 From bd3846d0fea2e8e3375fc54a6556561726f466cf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:45 -0700 Subject: perf metrics: Be PMU specific for referenced metrics. Hybrid systems may define the same metric for different PMUs, this can cause confusion of events. To avoid this make the referenced metric searches PMU specific, matching that in the table. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-39-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 6 +-- tools/perf/pmu-events/jevents.py | 4 +- tools/perf/pmu-events/pmu-events.h | 1 + tools/perf/util/metricgroup.c | 94 +++++++++++++++++++++++++++----------- tools/perf/util/metricgroup.h | 2 +- 5 files changed, 75 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fe9c6fa3f14a..8161f922715c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1794,7 +1794,7 @@ static int add_default_attributes(void) * will use this approach. To determine transaction support * on an architecture test for such a metric name. */ - if (!metricgroup__has_metric("transaction")) { + if (!metricgroup__has_metric("all", "transaction")) { pr_err("Missing transaction metrics"); return -1; } @@ -1823,7 +1823,7 @@ static int add_default_attributes(void) smi_reset = true; } - if (!metricgroup__has_metric("smi")) { + if (!metricgroup__has_metric("all", "smi")) { pr_err("Missing smi metrics"); return -1; } @@ -1903,7 +1903,7 @@ static int add_default_attributes(void) * caused by exposing latent bugs. This is fixed properly in: * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ */ - if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric("all", "TopdownL1") && !perf_pmu__has_hybrid()) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index f57a8f274025..b18dd2fcbf04 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -51,8 +51,8 @@ _json_event_attributes = [ # Attributes that are in pmu_metric rather than pmu_event. _json_metric_attributes = [ - 'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', 'desc', - 'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode', + 'pmu', 'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', + 'desc', 'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode', 'event_grouping' ] # Attributes that are bools or enum int values, encoded as '0', '1',... diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index 80349685cf4d..3549e6971a4d 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -51,6 +51,7 @@ struct pmu_event { }; struct pmu_metric { + const char *pmu; const char *metric_name; const char *metric_group; const char *metric_expr; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 908124dab122..cc5166d7f138 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -123,6 +123,7 @@ struct metric { * within the expression. */ struct expr_parse_ctx *pctx; + const char *pmu; /** The name of the metric such as "IPC". */ const char *metric_name; /** Modifier on the metric such as "u" or NULL for none. */ @@ -216,6 +217,7 @@ static struct metric *metric__new(const struct pmu_metric *pm, if (!m->pctx) goto out_err; + m->pmu = pm->pmu ?: "cpu"; m->metric_name = pm->metric_name; m->modifier = NULL; if (modifier) { @@ -259,11 +261,12 @@ static bool contains_metric_id(struct evsel **metric_events, int num_events, /** * setup_metric_events - Find a group of events in metric_evlist that correspond * to the IDs from a parsed metric expression. + * @pmu: The PMU for the IDs. * @ids: the metric IDs to match. * @metric_evlist: the list of perf events. * @out_metric_events: holds the created metric events array. */ -static int setup_metric_events(struct hashmap *ids, +static int setup_metric_events(const char *pmu, struct hashmap *ids, struct evlist *metric_evlist, struct evsel ***out_metric_events) { @@ -271,6 +274,7 @@ static int setup_metric_events(struct hashmap *ids, const char *metric_id; struct evsel *ev; size_t ids_size, matched_events, i; + bool all_pmus = !strcmp(pmu, "all"); *out_metric_events = NULL; ids_size = hashmap__size(ids); @@ -283,6 +287,8 @@ static int setup_metric_events(struct hashmap *ids, evlist__for_each_entry(metric_evlist, ev) { struct expr_id_data *val_ptr; + if (!all_pmus && strcmp(ev->pmu_name, pmu)) + continue; /* * Check for duplicate events with the same name. For * example, uncore_imc/cas_count_read/ will turn into 6 @@ -355,8 +361,13 @@ static bool match_metric(const char *n, const char *list) return false; } -static bool match_pm_metric(const struct pmu_metric *pm, const char *metric) +static bool match_pm_metric(const struct pmu_metric *pm, const char *pmu, const char *metric) { + const char *pm_pmu = pm->pmu ?: "cpu"; + + if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu)) + return false; + return match_metric(pm->metric_group, metric) || match_metric(pm->metric_name, metric); } @@ -766,6 +777,7 @@ struct visited_metric { struct metricgroup_add_iter_data { struct list_head *metric_list; + const char *pmu; const char *metric_name; const char *modifier; int *ret; @@ -779,7 +791,8 @@ struct metricgroup_add_iter_data { const struct pmu_metrics_table *table; }; -static bool metricgroup__find_metric(const char *metric, +static bool metricgroup__find_metric(const char *pmu, + const char *metric, const struct pmu_metrics_table *table, struct pmu_metric *pm); @@ -798,6 +811,7 @@ static int add_metric(struct list_head *metric_list, * resolve_metric - Locate metrics within the root metric and recursively add * references to them. * @metric_list: The list the metric is added to. + * @pmu: The PMU name to resolve metrics on, or "all" for all PMUs. * @modifier: if non-null event modifiers like "u". * @metric_no_group: Should events written to events be grouped "{}" or * global. Grouping is the default but due to multiplexing the @@ -813,6 +827,7 @@ static int add_metric(struct list_head *metric_list, * architecture perf is running upon. */ static int resolve_metric(struct list_head *metric_list, + const char *pmu, const char *modifier, bool metric_no_group, bool metric_no_threshold, @@ -842,7 +857,7 @@ static int resolve_metric(struct list_head *metric_list, hashmap__for_each_entry(root_metric->pctx->ids, cur, bkt) { struct pmu_metric pm; - if (metricgroup__find_metric(cur->pkey, table, &pm)) { + if (metricgroup__find_metric(pmu, cur->pkey, table, &pm)) { pending = realloc(pending, (pending_cnt + 1) * sizeof(struct to_resolve)); if (!pending) @@ -993,9 +1008,12 @@ static int __add_metric(struct list_head *metric_list, } if (!ret) { /* Resolve referenced metrics. */ - ret = resolve_metric(metric_list, modifier, metric_no_group, + const char *pmu = pm->pmu ?: "cpu"; + + ret = resolve_metric(metric_list, pmu, modifier, metric_no_group, metric_no_threshold, user_requested_cpu_list, - system_wide, root_metric, &visited_node, table); + system_wide, root_metric, &visited_node, + table); } if (ret) { if (is_root) @@ -1008,6 +1026,7 @@ static int __add_metric(struct list_head *metric_list, } struct metricgroup__find_metric_data { + const char *pmu; const char *metric; struct pmu_metric *pm; }; @@ -1017,6 +1036,10 @@ static int metricgroup__find_metric_callback(const struct pmu_metric *pm, void *vdata) { struct metricgroup__find_metric_data *data = vdata; + const char *pm_pmu = pm->pmu ?: "cpu"; + + if (strcmp(data->pmu, "all") && strcmp(pm_pmu, data->pmu)) + return 0; if (!match_metric(pm->metric_name, data->metric)) return 0; @@ -1025,11 +1048,13 @@ static int metricgroup__find_metric_callback(const struct pmu_metric *pm, return 1; } -static bool metricgroup__find_metric(const char *metric, +static bool metricgroup__find_metric(const char *pmu, + const char *metric, const struct pmu_metrics_table *table, struct pmu_metric *pm) { struct metricgroup__find_metric_data data = { + .pmu = pmu, .metric = metric, .pm = pm, }; @@ -1083,7 +1108,7 @@ static int metricgroup__add_metric_sys_event_iter(const struct pmu_metric *pm, struct metricgroup_add_iter_data *d = data; int ret; - if (!match_pm_metric(pm, d->metric_name)) + if (!match_pm_metric(pm, d->pmu, d->metric_name)) return 0; ret = add_metric(d->metric_list, pm, d->modifier, d->metric_no_group, @@ -1128,6 +1153,7 @@ static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l, struct metricgroup__add_metric_data { struct list_head *list; + const char *pmu; const char *metric_name; const char *modifier; const char *user_requested_cpu_list; @@ -1144,7 +1170,7 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm, struct metricgroup__add_metric_data *data = vdata; int ret = 0; - if (pm->metric_expr && match_pm_metric(pm, data->metric_name)) { + if (pm->metric_expr && match_pm_metric(pm, data->pmu, data->metric_name)) { bool metric_no_group = data->metric_no_group || match_metric(data->metric_name, pm->metricgroup_no_group); @@ -1159,6 +1185,7 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm, /** * metricgroup__add_metric - Find and add a metric, or a metric group. + * @pmu: The PMU name to search for metrics on, or "all" for all PMUs. * @metric_name: The name of the metric or metric group. For example, "IPC" * could be the name of a metric and "TopDownL1" the name of a * metric group. @@ -1172,7 +1199,7 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm, * @table: The table that is searched for metrics, most commonly the table for the * architecture perf is running upon. */ -static int metricgroup__add_metric(const char *metric_name, const char *modifier, +static int metricgroup__add_metric(const char *pmu, const char *metric_name, const char *modifier, bool metric_no_group, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, @@ -1186,6 +1213,7 @@ static int metricgroup__add_metric(const char *metric_name, const char *modifier { struct metricgroup__add_metric_data data = { .list = &list, + .pmu = pmu, .metric_name = metric_name, .modifier = modifier, .metric_no_group = metric_no_group, @@ -1210,6 +1238,7 @@ static int metricgroup__add_metric(const char *metric_name, const char *modifier .fn = metricgroup__add_metric_sys_event_iter, .data = (void *) &(struct metricgroup_add_iter_data) { .metric_list = &list, + .pmu = pmu, .metric_name = metric_name, .modifier = modifier, .metric_no_group = metric_no_group, @@ -1239,6 +1268,7 @@ out: /** * metricgroup__add_metric_list - Find and add metrics, or metric groups, * specified in a list. + * @pmu: A pmu to restrict the metrics to, or "all" for all PMUS. * @list: the list of metrics or metric groups. For example, "IPC,CPI,TopDownL1" * would match the IPC and CPI metrics, and TopDownL1 would match all * the metrics in the TopDownL1 group. @@ -1251,7 +1281,8 @@ out: * @table: The table that is searched for metrics, most commonly the table for the * architecture perf is running upon. */ -static int metricgroup__add_metric_list(const char *list, bool metric_no_group, +static int metricgroup__add_metric_list(const char *pmu, const char *list, + bool metric_no_group, bool metric_no_threshold, const char *user_requested_cpu_list, bool system_wide, struct list_head *metric_list, @@ -1270,7 +1301,7 @@ static int metricgroup__add_metric_list(const char *list, bool metric_no_group, if (modifier) *modifier++ = '\0'; - ret = metricgroup__add_metric(metric_name, modifier, + ret = metricgroup__add_metric(pmu, metric_name, modifier, metric_no_group, metric_no_threshold, user_requested_cpu_list, system_wide, metric_list, table); @@ -1460,7 +1491,8 @@ err_out: return ret; } -static int parse_groups(struct evlist *perf_evlist, const char *str, +static int parse_groups(struct evlist *perf_evlist, + const char *pmu, const char *str, bool metric_no_group, bool metric_no_merge, bool metric_no_threshold, @@ -1478,7 +1510,7 @@ static int parse_groups(struct evlist *perf_evlist, const char *str, if (metric_events_list->nr_entries == 0) metricgroup__rblist_init(metric_events_list); - ret = metricgroup__add_metric_list(str, metric_no_group, metric_no_threshold, + ret = metricgroup__add_metric_list(pmu, str, metric_no_group, metric_no_threshold, user_requested_cpu_list, system_wide, &metric_list, table); if (ret) @@ -1535,6 +1567,11 @@ static int parse_groups(struct evlist *perf_evlist, const char *str, strcmp(m->modifier, n->modifier))) continue; + if ((!m->pmu && n->pmu) || + (m->pmu && !n->pmu) || + (m->pmu && n->pmu && strcmp(m->pmu, n->pmu))) + continue; + if (expr__subset_of_ids(n->pctx, m->pctx)) { pr_debug("Events in '%s' fully contained within '%s'\n", m->metric_name, n->metric_name); @@ -1552,7 +1589,8 @@ static int parse_groups(struct evlist *perf_evlist, const char *str, metric_evlist = m->evlist; } - ret = setup_metric_events(m->pctx->ids, metric_evlist, &metric_events); + ret = setup_metric_events(fake_pmu ? "all" : m->pmu, m->pctx->ids, + metric_evlist, &metric_events); if (ret) { pr_debug("Cannot resolve IDs for %s: %s\n", m->metric_name, m->metric_expr); @@ -1623,7 +1661,7 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, if (!table) return -EINVAL; - return parse_groups(perf_evlist, str, metric_no_group, metric_no_merge, + return parse_groups(perf_evlist, "all", str, metric_no_group, metric_no_merge, metric_no_threshold, user_requested_cpu_list, system_wide, /*fake_pmu=*/NULL, metric_events, table); } @@ -1633,7 +1671,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist, const char *str, struct rblist *metric_events) { - return parse_groups(evlist, str, + return parse_groups(evlist, "all", str, /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/false, @@ -1642,28 +1680,32 @@ int metricgroup__parse_groups_test(struct evlist *evlist, &perf_pmu__fake, metric_events, table); } +struct metricgroup__has_metric_data { + const char *pmu; + const char *metric; +}; static int metricgroup__has_metric_callback(const struct pmu_metric *pm, const struct pmu_metrics_table *table __maybe_unused, void *vdata) { - const char *metric = vdata; - - if (match_metric(pm->metric_name, metric) || - match_metric(pm->metric_group, metric)) - return 1; + struct metricgroup__has_metric_data *data = vdata; - return 0; + return match_pm_metric(pm, data->pmu, data->metric) ? 1 : 0; } -bool metricgroup__has_metric(const char *metric) +bool metricgroup__has_metric(const char *pmu, const char *metric) { const struct pmu_metrics_table *table = pmu_metrics_table__find(); + struct metricgroup__has_metric_data data = { + .pmu = pmu, + .metric = metric, + }; if (!table) return false; - return pmu_metrics_table_for_each_metric(table, metricgroup__has_metric_callback, - (void *)metric) ? true : false; + return pmu_metrics_table_for_each_metric(table, metricgroup__has_metric_callback, &data) + ? true : false; } static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm, diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index 77472e35705e..08e9b9e953ec 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -80,7 +80,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist, struct rblist *metric_events); void metricgroup__print(const struct print_callbacks *print_cb, void *print_state); -bool metricgroup__has_metric(const char *metric); +bool metricgroup__has_metric(const char *pmu, const char *metric); unsigned int metricgroups__topdown_max_level(void); int arch_get_runtimeparam(const struct pmu_metric *pm); void metricgroup__rblist_exit(struct rblist *metric_events); -- cgit v1.2.3 From dae47d3940a77e1639edb0c5f0596f43bcff8bf8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:46 -0700 Subject: perf stat: Command line PMU metric filtering Wire up the --cputype value to limit which metrics are parsed. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-40-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 20 ++++++++++++-------- tools/perf/util/metricgroup.c | 3 ++- tools/perf/util/metricgroup.h | 1 + 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8161f922715c..e18b3239d42a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1783,6 +1783,7 @@ static int add_default_attributes(void) }; struct perf_event_attr default_null_attrs[] = {}; + const char *pmu = parse_events_option_args.pmu_filter ?: "all"; /* Set attrs if no event is selected and !null_run: */ if (stat_config.null_run) @@ -1794,11 +1795,11 @@ static int add_default_attributes(void) * will use this approach. To determine transaction support * on an architecture test for such a metric name. */ - if (!metricgroup__has_metric("all", "transaction")) { + if (!metricgroup__has_metric(pmu, "transaction")) { pr_err("Missing transaction metrics"); return -1; } - return metricgroup__parse_groups(evsel_list, "transaction", + return metricgroup__parse_groups(evsel_list, pmu, "transaction", stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, @@ -1823,7 +1824,7 @@ static int add_default_attributes(void) smi_reset = true; } - if (!metricgroup__has_metric("all", "smi")) { + if (!metricgroup__has_metric(pmu, "smi")) { pr_err("Missing smi metrics"); return -1; } @@ -1831,7 +1832,7 @@ static int add_default_attributes(void) if (!force_metric_only) stat_config.metric_only = true; - return metricgroup__parse_groups(evsel_list, "smi", + return metricgroup__parse_groups(evsel_list, pmu, "smi", stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, @@ -1864,7 +1865,8 @@ static int add_default_attributes(void) "Please print the result regularly, e.g. -I1000\n"); } str[8] = stat_config.topdown_level + '0'; - if (metricgroup__parse_groups(evsel_list, str, + if (metricgroup__parse_groups(evsel_list, + pmu, str, /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, @@ -1903,14 +1905,14 @@ static int add_default_attributes(void) * caused by exposing latent bugs. This is fixed properly in: * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ */ - if (metricgroup__has_metric("all", "TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric(pmu, "TopdownL1") && !perf_pmu__has_hybrid()) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; if (!metric_evlist) return -1; - if (metricgroup__parse_groups(metric_evlist, "TopdownL1", + if (metricgroup__parse_groups(metric_evlist, pmu, "TopdownL1", /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, @@ -2434,7 +2436,9 @@ int cmd_stat(int argc, const char **argv) * knowing the target is system-wide. */ if (metrics) { - metricgroup__parse_groups(evsel_list, metrics, + const char *pmu = parse_events_option_args.pmu_filter ?: "all"; + + metricgroup__parse_groups(evsel_list, pmu, metrics, stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index cc5166d7f138..103a672bb132 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1648,6 +1648,7 @@ out: } int metricgroup__parse_groups(struct evlist *perf_evlist, + const char *pmu, const char *str, bool metric_no_group, bool metric_no_merge, @@ -1661,7 +1662,7 @@ int metricgroup__parse_groups(struct evlist *perf_evlist, if (!table) return -EINVAL; - return parse_groups(perf_evlist, "all", str, metric_no_group, metric_no_merge, + return parse_groups(perf_evlist, pmu, str, metric_no_group, metric_no_merge, metric_no_threshold, user_requested_cpu_list, system_wide, /*fake_pmu=*/NULL, metric_events, table); } diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index 08e9b9e953ec..bf18274c15df 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -67,6 +67,7 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct evsel *evsel, bool create); int metricgroup__parse_groups(struct evlist *perf_evlist, + const char *pmu, const char *str, bool metric_no_group, bool metric_no_merge, -- cgit v1.2.3 From 1b8012b26f78b2dd124214256481c8643e4465e9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:47 -0700 Subject: perf vendor events intel: Correct alderlake metrics Fix the metrics tma_memory_bound on alderlake cpu_core and tma_microcode_sequencer on alderlake cpu_atom, where metrics had be rewritten across PMUs. Fix MEM_BOUND_STALLS_AT_RET_CORRECTION which is an aux metric but lacks a hash prefix. Add PMU prefixes for cpu_core/cpu_atom events to avoid wildcard opening the events. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-41-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/alderlake/adl-metrics.json | 238 ++++++++++----------- .../arch/x86/alderlaken/adln-metrics.json | 6 +- 2 files changed, 122 insertions(+), 122 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 4c2a14ea5a1c..840f6f6fc8c5 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -151,7 +151,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", - "MetricExpr": "(tma_info_slots - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / tma_info_slots", + "MetricExpr": "(tma_info_slots - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / tma_info_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -162,7 +162,7 @@ }, { "BriefDescription": "Counts the number of uops that are not from the microsequencer.", - "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / tma_info_slots", + "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@) / tma_info_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_base", "MetricThreshold": "tma_base > 0.6", @@ -229,7 +229,7 @@ }, { "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory disambiguation.", - "MetricExpr": "tma_nuke * (MACHINE_CLEARS.DISAMBIGUATION / MACHINE_CLEARS.SLOW)", + "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.DISAMBIGUATION@ / cpu_atom@MACHINE_CLEARS.SLOW@)", "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group", "MetricName": "tma_disambiguation", "MetricThreshold": "tma_disambiguation > 0.02", @@ -239,7 +239,7 @@ { "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1", @@ -277,7 +277,7 @@ }, { "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to FP assists.", - "MetricExpr": "tma_nuke * (MACHINE_CLEARS.FP_ASSIST / MACHINE_CLEARS.SLOW)", + "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.FP_ASSIST@ / cpu_atom@MACHINE_CLEARS.SLOW@)", "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group", "MetricName": "tma_fp_assist", "MetricThreshold": "tma_fp_assist > 0.02", @@ -314,7 +314,7 @@ }, { "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block", - "MetricExpr": "100 * LD_BLOCKS.4K_ALIAS / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.4K_ALIAS@ / MEM_UOPS_RETIRED.ALL_LOADS", "MetricName": "tma_info_address_alias_blocks", "Unit": "cpu_atom" }, @@ -334,14 +334,14 @@ }, { "BriefDescription": "", - "MetricExpr": "CPU_CLK_UNHALTED.CORE", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@", "MetricGroup": " ", "MetricName": "tma_info_clks", "Unit": "cpu_atom" }, { "BriefDescription": "", - "MetricExpr": "CPU_CLK_UNHALTED.CORE_P", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@", "MetricGroup": " ", "MetricName": "tma_info_clks_p", "Unit": "cpu_atom" @@ -383,35 +383,35 @@ }, { "BriefDescription": "Percentage of all uops which are FPDiv uops", - "MetricExpr": "100 * UOPS_RETIRED.FPDIV / UOPS_RETIRED.ALL", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / UOPS_RETIRED.ALL", "MetricGroup": " ", "MetricName": "tma_info_fpdiv_uop_ratio", "Unit": "cpu_atom" }, { "BriefDescription": "Percentage of all uops which are IDiv uops", - "MetricExpr": "100 * UOPS_RETIRED.IDIV / UOPS_RETIRED.ALL", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / UOPS_RETIRED.ALL", "MetricGroup": " ", "MetricName": "tma_info_idiv_uop_ratio", "Unit": "cpu_atom" }, { "BriefDescription": "Percent of instruction miss cost that hit in DRAM", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / MEM_BOUND_STALLS.IFETCH", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", "MetricGroup": " ", "MetricName": "tma_info_inst_miss_cost_dramhit_percent", "Unit": "cpu_atom" }, { "BriefDescription": "Percent of instruction miss cost that hit in the L2", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / MEM_BOUND_STALLS.IFETCH", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", "MetricGroup": " ", "MetricName": "tma_info_inst_miss_cost_l2hit_percent", "Unit": "cpu_atom" }, { "BriefDescription": "Percent of instruction miss cost that hit in the L3", - "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / MEM_BOUND_STALLS.IFETCH", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", "MetricGroup": " ", "MetricName": "tma_info_inst_miss_cost_l3hit_percent", "Unit": "cpu_atom" @@ -439,7 +439,7 @@ }, { "BriefDescription": "Instructions per Far Branch", - "MetricExpr": "INST_RETIRED.ANY / (BR_INST_RETIRED.FAR_BRANCH / 2)", + "MetricExpr": "INST_RETIRED.ANY / (cpu_atom@BR_INST_RETIRED.FAR_BRANCH@ / 2)", "MetricGroup": " ", "MetricName": "tma_info_ipfarbranch", "Unit": "cpu_atom" @@ -453,7 +453,7 @@ }, { "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken", - "MetricExpr": "INST_RETIRED.ANY / (BR_MISP_RETIRED.COND - BR_MISP_RETIRED.COND_TAKEN)", + "MetricExpr": "INST_RETIRED.ANY / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)", "MetricName": "tma_info_ipmisp_cond_ntaken", "Unit": "cpu_atom" }, @@ -498,20 +498,20 @@ }, { "BriefDescription": "Percentage of total non-speculative loads that are splits", - "MetricExpr": "100 * MEM_UOPS_RETIRED.SPLIT_LOADS / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / MEM_UOPS_RETIRED.ALL_LOADS", "MetricName": "tma_info_load_splits", "Unit": "cpu_atom" }, { "BriefDescription": "load ops retired per 1000 instruction", - "MetricExpr": "1e3 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / INST_RETIRED.ANY", "MetricGroup": " ", "MetricName": "tma_info_memloadpki", "Unit": "cpu_atom" }, { "BriefDescription": "Percentage of all uops which are ucode ops", - "MetricExpr": "100 * UOPS_RETIRED.MS / UOPS_RETIRED.ALL", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / UOPS_RETIRED.ALL", "MetricGroup": " ", "MetricName": "tma_info_microcode_uop_ratio", "Unit": "cpu_atom" @@ -525,7 +525,7 @@ }, { "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", - "MetricExpr": "100 * LD_BLOCKS.DATA_UNKNOWN / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / MEM_UOPS_RETIRED.ALL_LOADS", "MetricName": "tma_info_store_fwd_blocks", "Unit": "cpu_atom" }, @@ -545,7 +545,7 @@ }, { "BriefDescription": "Percentage of all uops which are x87 uops", - "MetricExpr": "100 * UOPS_RETIRED.X87 / UOPS_RETIRED.ALL", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / UOPS_RETIRED.ALL", "MetricGroup": " ", "MetricName": "tma_info_x87_uop_ratio", "Unit": "cpu_atom" @@ -571,7 +571,7 @@ { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.1", @@ -580,7 +580,7 @@ }, { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.1", @@ -589,7 +589,7 @@ }, { "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to load buffer full", - "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.LD_BUF / MEM_SCHEDULER_BLOCK.ALL", + "MetricExpr": "tma_mem_scheduler * cpu_atom@MEM_SCHEDULER_BLOCK.LD_BUF@ / MEM_SCHEDULER_BLOCK.ALL", "MetricGroup": "TopdownL4;tma_L4_group;tma_mem_scheduler_group", "MetricName": "tma_ld_buffer", "MetricThreshold": "tma_ld_buffer > 0.05", @@ -617,7 +617,7 @@ }, { "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads.", - "MetricExpr": "min(tma_backend_bound, LD_HEAD.ANY_AT_RET / tma_info_clks + tma_store_bound)", + "MetricExpr": "min(cpu_atom@TOPDOWN_BE_BOUND.ALL@ / tma_info_slots, cpu_atom@LD_HEAD.ANY_AT_RET@ / tma_info_clks + tma_store_bound)", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2", @@ -627,7 +627,7 @@ }, { "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory ordering.", - "MetricExpr": "tma_nuke * (MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.SLOW)", + "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_atom@MACHINE_CLEARS.SLOW@)", "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group", "MetricName": "tma_memory_ordering", "MetricThreshold": "tma_memory_ordering > 0.02", @@ -636,7 +636,7 @@ }, { "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)", - "MetricExpr": "tma_microcode_sequencer", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_ms_uops", "MetricThreshold": "tma_ms_uops > 0.05", @@ -692,7 +692,7 @@ }, { "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.", - "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / tma_info_slots", + "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@ - cpu_atom@UOPS_RETIRED.FPDIV@) / tma_info_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group", "MetricName": "tma_other_ret", "MetricThreshold": "tma_other_ret > 0.3", @@ -701,7 +701,7 @@ }, { "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to page faults.", - "MetricExpr": "tma_nuke * (MACHINE_CLEARS.PAGE_FAULT / MACHINE_CLEARS.SLOW)", + "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.PAGE_FAULT@ / cpu_atom@MACHINE_CLEARS.SLOW@)", "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group", "MetricName": "tma_page_fault", "MetricThreshold": "tma_page_fault > 0.02", @@ -758,7 +758,7 @@ }, { "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to RSV full relative", - "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.RSV / MEM_SCHEDULER_BLOCK.ALL", + "MetricExpr": "tma_mem_scheduler * cpu_atom@MEM_SCHEDULER_BLOCK.RSV@ / MEM_SCHEDULER_BLOCK.ALL", "MetricGroup": "TopdownL4;tma_L4_group;tma_mem_scheduler_group", "MetricName": "tma_rsv", "MetricThreshold": "tma_rsv > 0.05", @@ -776,7 +776,7 @@ }, { "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to SMC.", - "MetricExpr": "tma_nuke * (MACHINE_CLEARS.SMC / MACHINE_CLEARS.SLOW)", + "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.SMC@ / cpu_atom@MACHINE_CLEARS.SLOW@)", "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group", "MetricName": "tma_smc", "MetricThreshold": "tma_smc > 0.02", @@ -812,7 +812,7 @@ }, { "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.", - "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)", + "MetricExpr": "tma_mem_scheduler * (cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@MEM_SCHEDULER_BLOCK.ALL@)", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.1", @@ -830,7 +830,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -849,7 +849,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists.", - "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_slots", + "MetricExpr": "63 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_avx_assists", "MetricThreshold": "tma_avx_assists > 0.1", @@ -858,7 +858,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -880,7 +880,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -911,7 +911,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -922,7 +922,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(25 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 24 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(25 * tma_info_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_average_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -944,7 +944,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "24 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "24 * tma_info_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -975,7 +975,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks", + "MetricExpr": "cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@ / tma_info_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -985,7 +985,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", @@ -1005,7 +1005,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1015,7 +1015,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@) / tma_info_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1025,7 +1025,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "28 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "28 * tma_info_average_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1056,7 +1056,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -1088,7 +1088,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", - "MetricExpr": "30 * ASSISTS.FP / tma_info_slots", + "MetricExpr": "30 * cpu_core@ASSISTS.FP@ / tma_info_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_fp_assists", "MetricThreshold": "tma_fp_assists > 0.1", @@ -1118,7 +1118,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1128,7 +1128,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / (tma_retiring * tma_info_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1138,7 +1138,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -1149,7 +1149,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", @@ -1159,7 +1159,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1213,7 +1213,7 @@ }, { "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", + "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.COND@ + 3 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + (cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@)) / tma_info_slots)", "MetricGroup": "Ret;tma_issueBC", "MetricName": "tma_info_branching_overhead", "MetricThreshold": "tma_info_branching_overhead > 10", @@ -1222,21 +1222,21 @@ }, { "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "tma_info_callret", "Unit": "cpu_core" }, { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", "MetricGroup": "Pipeline", "MetricName": "tma_info_clks", "Unit": "cpu_core" }, { "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", "MetricGroup": "Fed;MemoryTLB", "MetricName": "tma_info_code_stlb_mpki", "Unit": "cpu_core" @@ -1266,7 +1266,7 @@ }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@", "MetricGroup": "SMT", "MetricName": "tma_info_core_clks", "Unit": "cpu_core" @@ -1309,7 +1309,7 @@ }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", + "MetricExpr": "IDQ.DSB_UOPS / cpu_core@UOPS_ISSUED.ANY@", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_dsb_coverage", "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 6 > 0.35", @@ -1350,7 +1350,7 @@ }, { "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_fb_hpki", "Unit": "cpu_core" @@ -1365,7 +1365,7 @@ { "BriefDescription": "Floating Point Operations Per Cycle", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_flopc", "Unit": "cpu_core" @@ -1373,7 +1373,7 @@ { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_clks)", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_fp_arith_utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).", @@ -1381,7 +1381,7 @@ }, { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_gflops", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.", @@ -1405,7 +1405,7 @@ }, { "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@)", "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", "MetricName": "tma_info_ilp", "Unit": "cpu_core" @@ -1421,7 +1421,7 @@ }, { "BriefDescription": "Total number of retired Instructions", - "MetricExpr": "INST_RETIRED.ANY", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@", "MetricGroup": "Summary;TmaL1;tma_L1_group", "MetricName": "tma_info_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", @@ -1438,7 +1438,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_iparith_avx128", "MetricThreshold": "tma_info_iparith_avx128 < 10", @@ -1447,7 +1447,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_iparith_avx256", "MetricThreshold": "tma_info_iparith_avx256 < 10", @@ -1514,7 +1514,7 @@ }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricExpr": "INST_RETIRED.ANY / cpu_core@BR_INST_RETIRED.FAR_BRANCH@u", "MetricGroup": "Branches;OS", "MetricName": "tma_info_ipfarbranch", "MetricThreshold": "tma_info_ipfarbranch < 1e6", @@ -1522,7 +1522,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", + "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_ipflop", "MetricThreshold": "tma_info_ipflop < 10", @@ -1610,14 +1610,14 @@ }, { "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches", "MetricName": "tma_info_jump", "Unit": "cpu_core" }, { "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / cpu_core@INST_RETIRED.ANY_P@k", "MetricGroup": "OS", "MetricName": "tma_info_kernel_cpi", "Unit": "cpu_core" @@ -1632,7 +1632,7 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", "MetricName": "tma_info_l1d_cache_fill_bw", "Unit": "cpu_core" @@ -1646,21 +1646,21 @@ }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l1mpki", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l1mpki_load", "Unit": "cpu_core" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", "MetricName": "tma_info_l2_cache_fill_bw", "Unit": "cpu_core" @@ -1674,56 +1674,56 @@ }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l2hpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l2hpki_load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", "MetricName": "tma_info_l2mpki", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", "MetricName": "tma_info_l2mpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / INST_RETIRED.ANY", "MetricGroup": "IcMiss", "MetricName": "tma_info_l2mpki_code", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / INST_RETIRED.ANY", "MetricGroup": "IcMiss", "MetricName": "tma_info_l2mpki_code_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l2mpki_load", "Unit": "cpu_core" }, { "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW;Offcore", "MetricName": "tma_info_l3_cache_access_bw", "Unit": "cpu_core" @@ -1737,7 +1737,7 @@ }, { "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", "MetricName": "tma_info_l3_cache_fill_bw", "Unit": "cpu_core" @@ -1751,7 +1751,7 @@ }, { "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", "MetricName": "tma_info_l3mpki", "Unit": "cpu_core" @@ -1786,14 +1786,14 @@ }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", "MetricName": "tma_info_load_stlb_mpki", "Unit": "cpu_core" }, { "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", - "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", + "MetricExpr": "LSD.UOPS / cpu_core@UOPS_ISSUED.ANY@", "MetricGroup": "Fed;LSD", "MetricName": "tma_info_lsd_coverage", "Unit": "cpu_core" @@ -1877,7 +1877,7 @@ }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@ITLB_MISSES.WALK_PENDING@ + cpu_core@DTLB_LOAD_MISSES.WALK_PENDING@ + cpu_core@DTLB_STORE_MISSES.WALK_PENDING@) / (4 * tma_info_core_clks)", "MetricGroup": "Mem;MemoryTLB", "MetricName": "tma_info_page_walks_utilization", "MetricThreshold": "tma_info_page_walks_utilization > 0.5", @@ -1893,21 +1893,21 @@ }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", + "MetricExpr": "cpu_core@TOPDOWN.SLOTS@", "MetricGroup": "TmaL1;tma_L1_group", "MetricName": "tma_info_slots", "Unit": "cpu_core" }, { "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricExpr": "(tma_info_slots / (cpu_core@TOPDOWN.SLOTS@ / 2) if #SMT_on else 1)", "MetricGroup": "SMT;TmaL1;tma_L1_group", "MetricName": "tma_info_slots_utilization", "Unit": "cpu_core" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", + "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)", "MetricGroup": "SMT", "MetricName": "tma_info_smt_2t_utilization", "Unit": "cpu_core" @@ -1921,7 +1921,7 @@ }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", "MetricName": "tma_info_store_stlb_mpki", "Unit": "cpu_core" @@ -1969,7 +1969,7 @@ }, { "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@) / (tma_retiring * tma_info_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_128b", "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1979,7 +1979,7 @@ }, { "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1999,7 +1999,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2010,7 +2010,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2020,7 +2020,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2030,7 +2030,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "9 * tma_info_average_frequency * cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2090,7 +2090,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ - cpu_core@L2_RQSTS.ALL_RFO@) + cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@ * (10 * cpu_core@L2_RQSTS.RFO_HIT@ + min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@))) / tma_info_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2100,7 +2100,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", @@ -2121,7 +2121,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2131,7 +2131,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2141,7 +2141,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "min(tma_backend_bound, LD_HEAD.ANY_AT_RET / tma_info_clks + tma_store_bound)", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2152,7 +2152,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_clks", + "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -2162,7 +2162,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@MEM_UOP_RETIRED.ANY@ / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", @@ -2181,7 +2181,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -2191,7 +2191,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", @@ -2201,7 +2201,7 @@ }, { "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", - "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_clks", + "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", @@ -2211,7 +2211,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / UOPS_ISSUED.ANY) / tma_info_clks", + "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / cpu_core@UOPS_ISSUED.ANY@) / tma_info_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -2221,7 +2221,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", - "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", @@ -2231,7 +2231,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -2252,7 +2252,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", - "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_slots", + "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_page_faults", "MetricThreshold": "tma_page_faults > 0.05", @@ -2292,7 +2292,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", + "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -2302,7 +2302,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_clks", + "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2342,7 +2342,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2382,7 +2382,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_load_miss_real_latency * cpu_core@LD_BLOCKS.NO_SR@ / tma_info_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2402,7 +2402,7 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_clks", + "MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2422,7 +2422,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2432,7 +2432,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2442,7 +2442,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_4_9@ + cpu_core@UOPS_DISPATCHED.PORT_7_8@) / (4 * tma_info_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -2470,7 +2470,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * cpu_core@OCR.STREAMING_WR.ANY_RESPONSE@ / tma_info_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2490,7 +2490,7 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD", + "MetricExpr": "tma_retiring * cpu_core@UOPS_EXECUTED.X87@ / UOPS_EXECUTED.THREAD", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json index 0402adbf7d92..f4b3c3883643 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json @@ -193,7 +193,7 @@ { "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1", @@ -480,7 +480,7 @@ { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.1", @@ -488,7 +488,7 @@ }, { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_clks - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.1", -- cgit v1.2.3 From d6b7dd1107ee24c68f8540f34b0a0483ed5fac07 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:48 -0700 Subject: perf jevents: Don't rewrite metrics across PMUs Don't rewrite metrics across PMUs as the result events likely won't be found. Identify metrics with a pair of PMU name and metric name. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-42-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 4 ++-- tools/perf/pmu-events/metric.py | 28 +++++++++++++++++----------- tools/perf/pmu-events/metric_test.py | 6 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index b18dd2fcbf04..487ff01baf1b 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -391,11 +391,11 @@ def read_json_events(path: str, topic: str) -> Sequence[JsonEvent]: except BaseException as err: print(f"Exception processing {path}") raise - metrics: list[Tuple[str, metric.Expression]] = [] + metrics: list[Tuple[str, str, metric.Expression]] = [] for event in events: event.topic = topic if event.metric_name and '-' not in event.metric_name: - metrics.append((event.metric_name, event.metric_expr)) + metrics.append((event.pmu, event.metric_name, event.metric_expr)) updates = metric.RewriteMetricsInTermsOfOthers(metrics) if updates: for event in events: diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py index 8ec0ba884673..af58b74d1644 100644 --- a/tools/perf/pmu-events/metric.py +++ b/tools/perf/pmu-events/metric.py @@ -552,28 +552,34 @@ def ParsePerfJson(orig: str) -> Expression: return _Constify(eval(compile(parsed, orig, 'eval'))) -def RewriteMetricsInTermsOfOthers(metrics: List[Tuple[str, Expression]] - )-> Dict[str, Expression]: +def RewriteMetricsInTermsOfOthers(metrics: List[Tuple[str, str, Expression]] + )-> Dict[Tuple[str, str], Expression]: """Shorten metrics by rewriting in terms of others. Args: - metrics (list): pairs of metric names and their expressions. + metrics (list): pmus, metric names and their expressions. Returns: - Dict: mapping from a metric name to a shortened expression. + Dict: mapping from a pmu, metric name pair to a shortened expression. """ - updates: Dict[str, Expression] = dict() - for outer_name, outer_expression in metrics: + updates: Dict[Tuple[str, str], Expression] = dict() + for outer_pmu, outer_name, outer_expression in metrics: + if outer_pmu is None: + outer_pmu = 'cpu' updated = outer_expression while True: - for inner_name, inner_expression in metrics: + for inner_pmu, inner_name, inner_expression in metrics: + if inner_pmu is None: + inner_pmu = 'cpu' + if inner_pmu.lower() != outer_pmu.lower(): + continue if inner_name.lower() == outer_name.lower(): continue - if inner_name in updates: - inner_expression = updates[inner_name] + if (inner_pmu, inner_name) in updates: + inner_expression = updates[(inner_pmu, inner_name)] updated = updated.Substitute(inner_name, inner_expression) if updated.Equals(outer_expression): break - if outer_name in updates and updated.Equals(updates[outer_name]): + if (outer_pmu, outer_name) in updates and updated.Equals(updates[(outer_pmu, outer_name)]): break - updates[outer_name] = updated + updates[(outer_pmu, outer_name)] = updated return updates diff --git a/tools/perf/pmu-events/metric_test.py b/tools/perf/pmu-events/metric_test.py index 40a3c7d8b2bc..ee22ff43ddd7 100755 --- a/tools/perf/pmu-events/metric_test.py +++ b/tools/perf/pmu-events/metric_test.py @@ -158,9 +158,9 @@ class TestMetricExpressions(unittest.TestCase): def test_RewriteMetricsInTermsOfOthers(self): Expression.__eq__ = lambda e1, e2: e1.Equals(e2) - before = [('m1', ParsePerfJson('a + b + c + d')), - ('m2', ParsePerfJson('a + b + c'))] - after = {'m1': ParsePerfJson('m2 + d')} + before = [('cpu', 'm1', ParsePerfJson('a + b + c + d')), + ('cpu', 'm2', ParsePerfJson('a + b + c'))] + after = {('cpu', 'm1'): ParsePerfJson('m2 + d')} self.assertEqual(RewriteMetricsInTermsOfOthers(before), after) Expression.__eq__ = None -- cgit v1.2.3 From 8a4859c50fb79fcbbf74963162389b1d3a87e484 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:49 -0700 Subject: perf metrics: Be PMU specific in event match Ids/events from a metric are turned into an event string and parsed; setup_metric_events matches the id back to the parsed evsel. With hybrid the same event may exist on both PMUs with the same name and be being used by metrics at the same time. A metric on cpu_core therefore shouldn't match against evsels on cpu_atom, or the metric will compute the wrong value. Make the matching sensitive to the PMU being parsed. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-43-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 103a672bb132..7de721e9d895 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -274,7 +274,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, const char *metric_id; struct evsel *ev; size_t ids_size, matched_events, i; - bool all_pmus = !strcmp(pmu, "all"); + bool all_pmus = !strcmp(pmu, "all") || !perf_pmu__is_hybrid(pmu); *out_metric_events = NULL; ids_size = hashmap__size(ids); @@ -287,7 +287,10 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, evlist__for_each_entry(metric_evlist, ev) { struct expr_id_data *val_ptr; - if (!all_pmus && strcmp(ev->pmu_name, pmu)) + /* Don't match events for the wrong hybrid PMU. */ + if (!all_pmus && ev->pmu_name && + perf_pmu__is_hybrid(ev->pmu_name) && + strcmp(ev->pmu_name, pmu)) continue; /* * Check for duplicate events with the same name. For @@ -304,6 +307,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, * about this event. */ if (hashmap__find(ids, metric_id, &val_ptr)) { + pr_debug("Matched metric-id %s to %s\n", metric_id, evsel__name(ev)); metric_events[matched_events++] = ev; if (matched_events >= ids_size) @@ -1592,7 +1596,7 @@ static int parse_groups(struct evlist *perf_evlist, ret = setup_metric_events(fake_pmu ? "all" : m->pmu, m->pctx->ids, metric_evlist, &metric_events); if (ret) { - pr_debug("Cannot resolve IDs for %s: %s\n", + pr_err("Cannot resolve IDs for %s: %s\n", m->metric_name, m->metric_expr); goto out; } -- cgit v1.2.3 From 718eabe1f329acedf1470aed67632d65dca5088c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:50 -0700 Subject: perf stat: Don't disable TopdownL1 metric on hybrid Now that hybrid bugs are fixed sufficient to run TopdownL1 metrics, don't implicitly disable them for hybrid. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-44-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e18b3239d42a..bc45cee3f77c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1900,12 +1900,7 @@ static int add_default_attributes(void) * Add TopdownL1 metrics if they exist. To minimize * multiplexing, don't request threshold computation. */ - /* - * TODO: TopdownL1 is disabled on hybrid CPUs to avoid a crashes - * caused by exposing latent bugs. This is fixed properly in: - * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ - */ - if (metricgroup__has_metric(pmu, "TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric(pmu, "TopdownL1")) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; -- cgit v1.2.3 From 9a1bc9ea01e2e95ed56801ed946b310f5562abfc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:51 -0700 Subject: perf parse-events: Reduce scope of is_event_supported Move to print-events.c and make static. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-45-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 39 --------------------------------------- tools/perf/util/parse-events.h | 2 -- tools/perf/util/print-events.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2dad88a6bf19..b93264f8a37c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -28,7 +28,6 @@ #include "util/bpf-filter.h" #include "util/util.h" #include "tracepoint.h" -#include "thread_map.h" #define MAX_NAME_LEN 100 @@ -133,44 +132,6 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { }, }; -bool is_event_supported(u8 type, u64 config) -{ - bool ret = true; - int open_return; - struct evsel *evsel; - struct perf_event_attr attr = { - .type = type, - .config = config, - .disabled = 1, - }; - struct perf_thread_map *tmap = thread_map__new_by_tid(0); - - if (tmap == NULL) - return false; - - evsel = evsel__new(&attr); - if (evsel) { - open_return = evsel__open(evsel, NULL, tmap); - ret = open_return >= 0; - - if (open_return == -EACCES) { - /* - * This happens if the paranoid value - * /proc/sys/kernel/perf_event_paranoid is set to 2 - * Re-run with exclude_kernel set; we don't do that - * by default as some ARM machines do not support it. - * - */ - evsel->core.attr.exclude_kernel = 1; - ret = evsel__open(evsel, NULL, tmap) >= 0; - } - evsel__delete(evsel); - } - - perf_thread_map__put(tmap); - return ret; -} - const char *event_type(int type) { switch (type) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 2a8cafe0ee8f..2021fe145410 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -18,8 +18,6 @@ struct parse_events_error; struct option; struct perf_pmu; -bool is_event_supported(u8 type, u64 config); - const char *event_type(int type); /* Arguments encoded in opt->value. */ diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index d148842b205a..69492cbd6921 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -27,6 +27,7 @@ #include "tracepoint.h" #include "pfm.h" #include "pmu-hybrid.h" +#include "thread_map.h" #define MAX_NAME_LEN 100 @@ -228,6 +229,44 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) strlist__delete(sdtlist); } +static bool is_event_supported(u8 type, u64 config) +{ + bool ret = true; + int open_return; + struct evsel *evsel; + struct perf_event_attr attr = { + .type = type, + .config = config, + .disabled = 1, + }; + struct perf_thread_map *tmap = thread_map__new_by_tid(0); + + if (tmap == NULL) + return false; + + evsel = evsel__new(&attr); + if (evsel) { + open_return = evsel__open(evsel, NULL, tmap); + ret = open_return >= 0; + + if (open_return == -EACCES) { + /* + * This happens if the paranoid value + * /proc/sys/kernel/perf_event_paranoid is set to 2 + * Re-run with exclude_kernel set; we don't do that + * by default as some ARM machines do not support it. + * + */ + evsel->core.attr.exclude_kernel = 1; + ret = evsel__open(evsel, NULL, tmap) >= 0; + } + evsel__delete(evsel); + } + + perf_thread_map__put(tmap); + return ret; +} + int print_hwcache_events(const struct print_callbacks *print_cb, void *print_state) { struct perf_pmu *pmu = NULL; -- cgit v1.2.3 From 190c6854e9ea0290e6af0ec28ee76c4f90d57cb8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 May 2023 16:27:16 -0300 Subject: perf build: Don't use -ftree-loop-distribute-patterns and -gno-variable-location-views in the python feature test when building with clang-13 Using -ftree-loop-distribute-patterns and -gno-variable-location-views in the python feature test when building with clang-16 results in: 16 80.04 clearlinux:latest : FAIL clang version 16.0.1 clang-16: error: unknown argument: '-gno-variable-location-views' clang-16: error: unknown argument: '-gno-variable-location-views' clang-16: error: optimization flag '-ftree-loop-distribute-patterns' is not supported [-Werror,-Wignored-optimization-argument] clang-16: error: optimization flag '-ftree-loop-distribute-patterns' is not supported [-Werror,-Wignored-optimization-argument] error: command '/usr/sbin/clang' failed with exit code 1 Noticed when building on a docker.io/library/clearlinux:latest container. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index c294db713677..869738fc06c3 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -36,6 +36,10 @@ if cc_is_clang: vars[var] = sub("-fno-semantic-interposition", "", vars[var]) if not clang_has_option("-ffat-lto-objects"): vars[var] = sub("-ffat-lto-objects", "", vars[var]) + if not clang_has_option("-ftree-loop-distribute-patterns"): + vars[var] = sub("-ftree-loop-distribute-patterns", "", vars[var]) + if not clang_has_option("-gno-variable-location-views"): + vars[var] = sub("-gno-variable-location-views", "", vars[var]) from setuptools import setup, Extension -- cgit v1.2.3 From c0d68601cbcefaf69018b3e7aff2687316950ace Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 12 May 2023 23:34:47 -0700 Subject: perf test: Add cputype testing to perf stat Check a bogus PMU fails and that a known PMU succeeds. Limit to PMUs known cpu, cpu_atom and armv8_pmuv3_0 ones. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230513063447.464691-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat.sh | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh index b154fbb15d54..3f1e67795490 100755 --- a/tools/perf/tests/shell/stat.sh +++ b/tools/perf/tests/shell/stat.sh @@ -103,10 +103,54 @@ test_topdown_weak_groups() { echo "Topdown weak groups test [Success]" } +test_cputype() { + # Test --cputype argument. + echo "cputype test" + + # Bogus PMU should fail. + if perf stat --cputype="123" -e instructions true > /dev/null 2>&1 + then + echo "cputype test [Bogus PMU didn't fail]" + err=1 + return + fi + + # Find a known PMU for cputype. + pmu="" + for i in cpu cpu_atom armv8_pmuv3_0 + do + if test -d "/sys/devices/$i" + then + pmu="$i" + break + fi + if perf stat -e "$i/instructions/" true > /dev/null 2>&1 + then + pmu="$i" + break + fi + done + if test "x$pmu" = "x" + then + echo "cputype test [Skipped known PMU not found]" + return + fi + + # Test running with cputype produces output. + if ! perf stat --cputype="$pmu" -e instructions true 2>&1 | grep -E -q "instructions" + then + echo "cputype test [Failed count missed with given filter]" + err=1 + return + fi + echo "cputype test [Success]" +} + test_default_stat test_stat_record_report test_stat_record_script test_stat_repeat_weak_groups test_topdown_groups test_topdown_weak_groups +test_cputype exit $err -- cgit v1.2.3 From 21a165133c85d072051aa214099ad46a49239c46 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 13 May 2023 22:37:24 +0200 Subject: perf doc: Define man page date when using asciidoctor When building perf documentation with asciidoc, we use "git log" to find the last commit date of each doc source and pass that to asciidoc to use as the man page date. When using asciidoctor, however, the current date is always used instead. Defining perf_date like we do for asciidoc also doesn't work because we're not using DocBook as an intermediate format. The asciidoctor man page backend looks for the variable "docdate", so set that instead. Signed-off-by: Ben Hutchings Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Salvatore Bonaccorso Link: https://lore.kernel.org/r/ZF/1BOahN/i6xbBx@decadent.org.uk Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index ba5d942e4c6a..8a7d7078e386 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -250,11 +250,16 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ mv $@+ $@ +# Generate date from git log of the doc input file +PERF_DATE = $(shell git log -1 --pretty="format:%cd" \ + --date=short --no-show-signature $<) + ifdef USE_ASCIIDOCTOR $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b manpage -d manpage \ - $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ + $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ + -adocdate=$(PERF_DATE) -o $@+ $< && \ mv $@+ $@ endif @@ -266,9 +271,7 @@ $(OUTPUT)%.xml : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b docbook -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ - -aperf_date=$(shell git log -1 --pretty="format:%cd" \ - --date=short --no-show-signature $<) \ - -o $@+ $< && \ + -aperf_date=$(PERF_DATE) -o $@+ $< && \ mv $@+ $@ XSLT = docbook.xsl -- cgit v1.2.3 From 61b3d2107d2aae5dc86bbbbd0c9f6151d694cdc4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 13 May 2023 22:37:43 +0200 Subject: perf doc: Add support for KBUILD_BUILD_TIMESTAMP When building man pages from a Git checkout, we consistently set the man page date based on when the input was last changed. Otherwise, it defaults to the build time, which is not reproducible. Allow the date to be set through the KBUILD_BUILD_TIMESTAMP variable, as for timestamps in the kernel itself. Signed-off-by: Ben Hutchings Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Salvatore Bonaccorso Link: https://lore.kernel.org/r/ZF/1F1P+b9qZ/vVH@decadent.org.uk Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 8a7d7078e386..4407b106d977 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -250,9 +250,13 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ mv $@+ $@ -# Generate date from git log of the doc input file -PERF_DATE = $(shell git log -1 --pretty="format:%cd" \ - --date=short --no-show-signature $<) +# Generate date from either KBUILD_BUILD_TIMESTAMP or git log of +# the doc input file +PERF_DATE = $(strip \ + $(if $(KBUILD_BUILD_TIMESTAMP), \ + $(shell date -u -d '$(KBUILD_BUILD_TIMESTAMP)' +%Y-%m-%d), \ + $(shell git log -1 --pretty="format:%cd" \ + --date=short --no-show-signature $<))) ifdef USE_ASCIIDOCTOR $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt -- cgit v1.2.3 From 983034cd0d212b23a63efb48ecc47d55d70ee301 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 May 2023 23:27:23 -0700 Subject: perf annotate: Handle "decq", "incq", "testq", "tzcnt" instructions on x86 I found that the "decq", "incq", "testq", "tzcnt" instructions didn't parse the operands properly. Add them to the "x86__instructions" table to fix the issue. Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230511062725.514752-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 305872692bfd..5c7bec25fee4 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -35,12 +35,14 @@ static struct ins x86__instructions[] = { { .name = "cs", .ops = &mov_ops, }, { .name = "dec", .ops = &dec_ops, }, { .name = "decl", .ops = &dec_ops, }, + { .name = "decq", .ops = &dec_ops, }, { .name = "divsd", .ops = &mov_ops, }, { .name = "divss", .ops = &mov_ops, }, { .name = "gs", .ops = &mov_ops, }, { .name = "imul", .ops = &mov_ops, }, { .name = "inc", .ops = &dec_ops, }, { .name = "incl", .ops = &dec_ops, }, + { .name = "incq", .ops = &dec_ops, }, { .name = "ja", .ops = &jump_ops, }, { .name = "jae", .ops = &jump_ops, }, { .name = "jb", .ops = &jump_ops, }, @@ -123,6 +125,8 @@ static struct ins x86__instructions[] = { { .name = "test", .ops = &mov_ops, }, { .name = "testb", .ops = &mov_ops, }, { .name = "testl", .ops = &mov_ops, }, + { .name = "testq", .ops = &mov_ops, }, + { .name = "tzcnt", .ops = &mov_ops, }, { .name = "ucomisd", .ops = &mov_ops, }, { .name = "ucomiss", .ops = &mov_ops, }, { .name = "vaddsd", .ops = &mov_ops, }, -- cgit v1.2.3 From 94f0705eee70cb256d21c9abe7ce44ffbe093555 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 May 2023 23:27:24 -0700 Subject: perf annotate: Parse x86 SIB addressing properly When the source argument of the "mov" instruction looks like below, it didn't parse the whole operand and just stopped at the first comma. mov (%rbx,%rax,1),%rcx Fix it by checking the parentheses and move it to the closing one. Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230511062725.514752-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 11992cfe271c..b708bbc49c9e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -558,6 +558,19 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_sy return -1; *s = '\0'; + + /* + * x86 SIB addressing has something like 0x8(%rax, %rcx, 1) + * then it needs to have the closing parenthesis. + */ + if (strchr(ops->raw, '(')) { + *s = ','; + s = strchr(ops->raw, ')'); + if (s == NULL || s[1] != ',') + return -1; + *++s = '\0'; + } + ops->source.raw = strdup(ops->raw); *s = ','; -- cgit v1.2.3 From 6d491b37e70daeb963e3b589b746d99b8b4b1357 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 May 2023 23:27:25 -0700 Subject: perf annotate browser: Add '<' and '>' keys for navigation hists__find_annotations() allows to move to next or previous symbols for annotation using the arrow keys. But TUI annotate_browser__run() uses the RIGHT key as ENTER to handle jump/call instructions. That makes the navigation to the next function impossible. I'd like to change it back to move the next symbol but I'm afraid if some users get confused. So I added a new pair of keys to handle that. Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230511062725.514752-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 4 +++- tools/perf/ui/browsers/annotate.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 63cdf6ea6f6d..425a7e2fd6fb 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -342,7 +342,7 @@ static void hists__find_annotations(struct hists *hists, notes = symbol__annotation(he->ms.sym); if (notes->src == NULL) { find_next: - if (key == K_LEFT) + if (key == K_LEFT || key == '<') nd = rb_prev(nd); else nd = rb_next(nd); @@ -378,9 +378,11 @@ find_next: return; /* fall through */ case K_RIGHT: + case '>': next = rb_next(nd); break; case K_LEFT: + case '<': next = rb_prev(nd); break; default: diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 12c3ce530e42..70bad42b807b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -781,9 +781,9 @@ static int annotate_browser__run(struct annotate_browser *browser, ui_browser__help_window(&browser->b, "UP/DOWN/PGUP\n" "PGDN/SPACE Navigate\n" + " Move to prev/next symbol\n" "q/ESC/CTRL+C Exit\n\n" "ENTER Go to target\n" - "ESC Exit\n" "H Go to hottest instruction\n" "TAB/shift+TAB Cycle thru hottest instructions\n" "j Toggle showing jump to target arrows\n" @@ -913,6 +913,8 @@ show_sup_ins: annotation__toggle_full_addr(notes, ms); continue; case K_LEFT: + case '<': + case '>': case K_ESC: case 'q': case CTRL('c'): -- cgit v1.2.3 From 40bf1cb07ee72480dd831d7a13e42728dfad35e2 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 13 May 2023 15:40:00 +0800 Subject: perf ftrace: Flush output after each writing The pager will result stdout in full buffering mode instead of line buffering. We need to make the trace visible timely. Signed-off-by: Changbin Du Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230513074000.733550-1-changbin.du@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 810e3376c7d6..ad2a9ae041f6 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -650,6 +650,8 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace) break; if (fwrite(buf, n, 1, stdout) != 1) break; + /* flush output since stdout is in full buffering mode due to pager */ + fflush(stdout); } } -- cgit v1.2.3 From a9650b7f6fc09d1659d7851384200b8ebec52cb6 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Thu, 27 Apr 2023 15:32:20 -0700 Subject: perf vendor events arm64: Add AmpereOne core PMU events Add JSON files for AmpereOne core PMU events. Reviewed-by: John Garry Signed-off-by: Doug Rady Signed-off-by: Ilkka Koskinen Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230427223220.1068356-1-ilkka@os.amperecomputing.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/ampere/ampereone/branch.json | 17 + .../arch/arm64/ampere/ampereone/bus.json | 32 + .../arch/arm64/ampere/ampereone/cache.json | 104 +++ .../arch/arm64/ampere/ampereone/core-imp-def.json | 698 +++++++++++++++++++++ .../arch/arm64/ampere/ampereone/exception.json | 44 ++ .../arch/arm64/ampere/ampereone/instruction.json | 89 +++ .../arch/arm64/ampere/ampereone/intrinsic.json | 14 + .../arch/arm64/ampere/ampereone/memory.json | 44 ++ .../arch/arm64/ampere/ampereone/pipeline.json | 23 + .../arch/arm64/ampere/ampereone/spe.json | 14 + tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 + 11 files changed, 1080 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/exception.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/intrinsic.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/pipeline.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereone/spe.json diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/branch.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/branch.json new file mode 100644 index 000000000000..c751d57f2e19 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/branch.json @@ -0,0 +1,17 @@ +[ + { + "ArchStdEvent": "BR_IMMED_SPEC" + }, + { + "ArchStdEvent": "BR_RETURN_SPEC" + }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC" + }, + { + "ArchStdEvent": "BR_MIS_PRED" + }, + { + "ArchStdEvent": "BR_PRED" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/bus.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/bus.json new file mode 100644 index 000000000000..8623be121818 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/bus.json @@ -0,0 +1,32 @@ +[ + { + "ArchStdEvent": "CPU_CYCLES" + }, + { + "ArchStdEvent": "BUS_CYCLES" + }, + { + "ArchStdEvent": "BUS_ACCESS_RD" + }, + { + "ArchStdEvent": "BUS_ACCESS_WR" + }, + { + "ArchStdEvent": "BUS_ACCESS_SHARED" + }, + { + "ArchStdEvent": "BUS_ACCESS_NOT_SHARED" + }, + { + "ArchStdEvent": "BUS_ACCESS_NORMAL" + }, + { + "ArchStdEvent": "BUS_ACCESS_PERIPH" + }, + { + "ArchStdEvent": "BUS_ACCESS" + }, + { + "ArchStdEvent": "CNT_CYCLES" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json new file mode 100644 index 000000000000..fc0633054211 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/cache.json @@ -0,0 +1,104 @@ +[ + { + "ArchStdEvent": "L1D_CACHE_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_WR" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_VICTIM" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_CLEAN" + }, + { + "ArchStdEvent": "L2D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L1I_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1I_TLB_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL" + }, + { + "ArchStdEvent": "L1I_CACHE" + }, + { + "ArchStdEvent": "L2D_CACHE" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L2D_CACHE_WB" + }, + { + "ArchStdEvent": "L1D_TLB" + }, + { + "ArchStdEvent": "L1I_TLB" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL" + }, + { + "ArchStdEvent": "L2I_TLB_REFILL" + }, + { + "ArchStdEvent": "L2D_TLB" + }, + { + "ArchStdEvent": "L2I_TLB" + }, + { + "ArchStdEvent": "DTLB_WALK" + }, + { + "ArchStdEvent": "ITLB_WALK" + }, + { + "ArchStdEvent": "L1D_CACHE_LMISS_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_LMISS" + }, + { + "ArchStdEvent": "L1I_CACHE_LMISS" + }, + { + "ArchStdEvent": "L2D_CACHE_LMISS_RD" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json new file mode 100644 index 000000000000..95c30243f2b2 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json @@ -0,0 +1,698 @@ +[ + { + "PublicDescription": "Level 2 prefetch requests, refilled to L2 cache", + "EventCode": "0x10A", + "EventName": "L2_PREFETCH_REFILL", + "BriefDescription": "Level 2 prefetch requests, refilled to L2 cache" + }, + { + "PublicDescription": "Level 2 prefetch requests, late", + "EventCode": "0x10B", + "EventName": "L2_PREFETCH_UPGRADE", + "BriefDescription": "Level 2 prefetch requests, late" + }, + { + "PublicDescription": "Predictable branch speculatively executed that hit any level of BTB", + "EventCode": "0x110", + "EventName": "BPU_HIT_BTB", + "BriefDescription": "Predictable branch speculatively executed that hit any level of BTB" + }, + { + "PublicDescription": "Predictable conditional branch speculatively executed that hit any level of BTB", + "EventCode": "0x111", + "EventName": "BPU_CONDITIONAL_BRANCH_HIT_BTB", + "BriefDescription": "Predictable conditional branch speculatively executed that hit any level of BTB" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor", + "EventCode": "0x112", + "EventName": "BPU_HIT_INDIRECT_PREDICTOR", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor", + "EventCode": "0x113", + "EventName": "BPU_HIT_RSB", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor" + }, + { + "PublicDescription": "Predictable unconditional branch speculatively executed that did not hit any level of BTB", + "EventCode": "0x114", + "EventName": "BPU_UNCONDITIONAL_BRANCH_MISS_BTB", + "BriefDescription": "Predictable unconditional branch speculatively executed that did not hit any level of BTB" + }, + { + "PublicDescription": "Predictable branch speculatively executed, unpredicted", + "EventCode": "0x115", + "EventName": "BPU_BRANCH_NO_HIT", + "BriefDescription": "Predictable branch speculatively executed, unpredicted" + }, + { + "PublicDescription": "Predictable branch speculatively executed that hit any level of BTB that mispredict", + "EventCode": "0x116", + "EventName": "BPU_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable branch speculatively executed that hit any level of BTB that mispredict" + }, + { + "PublicDescription": "Predictable conditional branch speculatively executed that hit any level of BTB that (direction) mispredict", + "EventCode": "0x117", + "EventName": "BPU_CONDITIONAL_BRANCH_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable conditional branch speculatively executed that hit any level of BTB that (direction) mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor that mispredict", + "EventCode": "0x118", + "EventName": "BPU_INDIRECT_BRANCH_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor that mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor that mispredict", + "EventCode": "0x119", + "EventName": "BPU_HIT_RSB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor that mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the overflow/underflow return predictor that mispredict", + "EventCode": "0x11a", + "EventName": "BPU_MISS_RSB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the overflow/underflow return predictor that mispredict" + }, + { + "PublicDescription": "Predictable branch speculatively executed, unpredicted, that mispredict", + "EventCode": "0x11b", + "EventName": "BPU_NO_PREDICTION_MISPREDICT", + "BriefDescription": "Predictable branch speculatively executed, unpredicted, that mispredict" + }, + { + "PublicDescription": "Predictable branch speculatively executed, unpredicted, that mispredict", + "EventCode": "0x11c", + "EventName": "BPU_BTB_UPDATE", + "BriefDescription": "Predictable branch speculatively executed, unpredicted, that mispredict" + }, + { + "PublicDescription": "Count predict pipe stalls due to speculative return address predictor full", + "EventCode": "0x11d", + "EventName": "BPU_RSB_FULL_STALL", + "BriefDescription": "Count predict pipe stalls due to speculative return address predictor full" + }, + { + "PublicDescription": "Macro-ops speculatively decoded", + "EventCode": "0x11f", + "EventName": "ICF_INST_SPEC_DECODE", + "BriefDescription": "Macro-ops speculatively decoded" + }, + { + "PublicDescription": "Flushes", + "EventCode": "0x120", + "EventName": "GPC_FLUSH", + "BriefDescription": "Flushes" + }, + { + "PublicDescription": "Flushes due to memory hazards", + "EventCode": "0x121", + "EventName": "BPU_FLUSH_MEM_FAULT", + "BriefDescription": "Flushes due to memory hazards" + }, + { + "PublicDescription": "ETM extout bit 0", + "EventCode": "0x141", + "EventName": "MSC_ETM_EXTOUT0", + "BriefDescription": "ETM extout bit 0" + }, + { + "PublicDescription": "ETM extout bit 1", + "EventCode": "0x142", + "EventName": "MSC_ETM_EXTOUT1", + "BriefDescription": "ETM extout bit 1" + }, + { + "PublicDescription": "ETM extout bit 2", + "EventCode": "0x143", + "EventName": "MSC_ETM_EXTOUT2", + "BriefDescription": "ETM extout bit 2" + }, + { + "PublicDescription": "ETM extout bit 3", + "EventCode": "0x144", + "EventName": "MSC_ETM_EXTOUT3", + "BriefDescription": "ETM extout bit 3" + }, + { + "PublicDescription": "Bus request sn", + "EventCode": "0x156", + "EventName": "L2C_SNOOP", + "BriefDescription": "Bus request sn" + }, + { + "PublicDescription": "L2 TXDAT LCRD blocked", + "EventCode": "0x169", + "EventName": "L2C_DAT_CRD_STALL", + "BriefDescription": "L2 TXDAT LCRD blocked" + }, + { + "PublicDescription": "L2 TXRSP LCRD blocked", + "EventCode": "0x16a", + "EventName": "L2C_RSP_CRD_STALL", + "BriefDescription": "L2 TXRSP LCRD blocked" + }, + { + "PublicDescription": "L2 TXREQ LCRD blocked", + "EventCode": "0x16b", + "EventName": "L2C_REQ_CRD_STALL", + "BriefDescription": "L2 TXREQ LCRD blocked" + }, + { + "PublicDescription": "Early mispredict", + "EventCode": "0xD100", + "EventName": "ICF_EARLY_MIS_PRED", + "BriefDescription": "Early mispredict" + }, + { + "PublicDescription": "FEQ full cycles", + "EventCode": "0xD101", + "EventName": "ICF_FEQ_FULL", + "BriefDescription": "FEQ full cycles" + }, + { + "PublicDescription": "Instruction FIFO Full", + "EventCode": "0xD102", + "EventName": "ICF_INST_FIFO_FULL", + "BriefDescription": "Instruction FIFO Full" + }, + { + "PublicDescription": "L1I TLB miss", + "EventCode": "0xD103", + "EventName": "L1I_TLB_MISS", + "BriefDescription": "L1I TLB miss" + }, + { + "PublicDescription": "ICF sent 0 instructions to IDR this cycle", + "EventCode": "0xD104", + "EventName": "ICF_STALL", + "BriefDescription": "ICF sent 0 instructions to IDR this cycle" + }, + { + "PublicDescription": "PC FIFO Full", + "EventCode": "0xD105", + "EventName": "ICF_PC_FIFO_FULL", + "BriefDescription": "PC FIFO Full" + }, + { + "PublicDescription": "Stall due to BOB ID", + "EventCode": "0xD200", + "EventName": "IDR_STALL_BOB_ID", + "BriefDescription": "Stall due to BOB ID" + }, + { + "PublicDescription": "Dispatch stall due to LOB entries", + "EventCode": "0xD201", + "EventName": "IDR_STALL_LOB_ID", + "BriefDescription": "Dispatch stall due to LOB entries" + }, + { + "PublicDescription": "Dispatch stall due to SOB entries", + "EventCode": "0xD202", + "EventName": "IDR_STALL_SOB_ID", + "BriefDescription": "Dispatch stall due to SOB entries" + }, + { + "PublicDescription": "Dispatch stall due to IXU scheduler entries", + "EventCode": "0xD203", + "EventName": "IDR_STALL_IXU_SCHED", + "BriefDescription": "Dispatch stall due to IXU scheduler entries" + }, + { + "PublicDescription": "Dispatch stall due to FSU scheduler entries", + "EventCode": "0xD204", + "EventName": "IDR_STALL_FSU_SCHED", + "BriefDescription": "Dispatch stall due to FSU scheduler entries" + }, + { + "PublicDescription": "Dispatch stall due to ROB entries", + "EventCode": "0xD205", + "EventName": "IDR_STALL_ROB_ID", + "BriefDescription": "Dispatch stall due to ROB entries" + }, + { + "PublicDescription": "Dispatch stall due to flush (6 cycles)", + "EventCode": "0xD206", + "EventName": "IDR_STALL_FLUSH", + "BriefDescription": "Dispatch stall due to flush (6 cycles)" + }, + { + "PublicDescription": "Dispatch stall due to WFI", + "EventCode": "0xD207", + "EventName": "IDR_STALL_WFI", + "BriefDescription": "Dispatch stall due to WFI" + }, + { + "PublicDescription": "Number of SWOB drains triggered by timeout", + "EventCode": "0xD208", + "EventName": "IDR_STALL_SWOB_TIMEOUT", + "BriefDescription": "Number of SWOB drains triggered by timeout" + }, + { + "PublicDescription": "Number of SWOB drains triggered by system register or special-purpose register read-after-write or specific special-purpose register writes that cause SWOB drain", + "EventCode": "0xD209", + "EventName": "IDR_STALL_SWOB_RAW", + "BriefDescription": "Number of SWOB drains triggered by system register or special-purpose register read-after-write or specific special-purpose register writes that cause SWOB drain" + }, + { + "PublicDescription": "Number of SWOB drains triggered by system register write when SWOB full", + "EventCode": "0xD20A", + "EventName": "IDR_STALL_SWOB_FULL", + "BriefDescription": "Number of SWOB drains triggered by system register write when SWOB full" + }, + { + "PublicDescription": "Dispatch stall due to L1 instruction cache miss", + "EventCode": "0xD20B", + "EventName": "STALL_FRONTEND_CACHE", + "BriefDescription": "Dispatch stall due to L1 instruction cache miss" + }, + { + "PublicDescription": "Dispatch stall due to L1 instruction TLB miss", + "EventCode": "0xD20C", + "EventName": "STALL_FRONTEND_TLB", + "BriefDescription": "Dispatch stall due to L1 instruction TLB miss" + }, + { + "PublicDescription": "Dispatch stall due to L1 data cache miss", + "EventCode": "0xD20D", + "EventName": "STALL_BACKEND_CACHE", + "BriefDescription": "Dispatch stall due to L1 data cache miss" + }, + { + "PublicDescription": "Dispatch stall due to L1 data TLB miss", + "EventCode": "0xD20E", + "EventName": "STALL_BACKEND_TLB", + "BriefDescription": "Dispatch stall due to L1 data TLB miss" + }, + { + "PublicDescription": "Dispatch stall due to lack of any core resource", + "EventCode": "0xD20F", + "EventName": "STALL_BACKEND_RESOURCE", + "BriefDescription": "Dispatch stall due to lack of any core resource" + }, + { + "PublicDescription": "Instructions issued by the scheduler", + "EventCode": "0xD300", + "EventName": "IXU_NUM_UOPS_ISSUED", + "BriefDescription": "Instructions issued by the scheduler" + }, + { + "PublicDescription": "Any uop issued was canceled for any reason", + "EventCode": "0xD301", + "EventName": "IXU_ISSUE_CANCEL", + "BriefDescription": "Any uop issued was canceled for any reason" + }, + { + "PublicDescription": "A load wakeup to the scheduler has been cancelled", + "EventCode": "0xD302", + "EventName": "IXU_LOAD_CANCEL", + "BriefDescription": "A load wakeup to the scheduler has been cancelled" + }, + { + "PublicDescription": "The scheduler had to cancel one slow Uop due to resource conflict", + "EventCode": "0xD303", + "EventName": "IXU_SLOW_CANCEL", + "BriefDescription": "The scheduler had to cancel one slow Uop due to resource conflict" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA", + "EventCode": "0xD304", + "EventName": "IXU_IXA_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA Par 0", + "EventCode": "0xD305", + "EventName": "IXU_IXA_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA Par 1", + "EventCode": "0xD306", + "EventName": "IXU_IXA_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB", + "EventCode": "0xD307", + "EventName": "IXU_IXB_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB Par 0", + "EventCode": "0xD308", + "EventName": "IXU_IXB_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB Par 1", + "EventCode": "0xD309", + "EventName": "IXU_IXB_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC", + "EventCode": "0xD30A", + "EventName": "IXU_IXC_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC Par 0", + "EventCode": "0xD30B", + "EventName": "IXU_IXC_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC Par 1", + "EventCode": "0xD30C", + "EventName": "IXU_IXC_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD", + "EventCode": "0xD30D", + "EventName": "IXU_IXD_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD Par 0", + "EventCode": "0xD30E", + "EventName": "IXU_IXD_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD Par 1", + "EventCode": "0xD30F", + "EventName": "IXU_IXD_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD Par 1" + }, + { + "PublicDescription": "Uops issued by the FSU scheduler", + "EventCode": "0xD400", + "EventName": "FSU_ISSUED", + "BriefDescription": "Uops issued by the FSU scheduler" + }, + { + "PublicDescription": "Uops issued by the scheduler on pipe X", + "EventCode": "0xD401", + "EventName": "FSU_FSX_ISSUED", + "BriefDescription": "Uops issued by the scheduler on pipe X" + }, + { + "PublicDescription": "Uops issued by the scheduler on pipe Y", + "EventCode": "0xD402", + "EventName": "FSU_FSY_ISSUED", + "BriefDescription": "Uops issued by the scheduler on pipe Y" + }, + { + "PublicDescription": "Uops issued by the scheduler on pipe Z", + "EventCode": "0xD403", + "EventName": "FSU_FSZ_ISSUED", + "BriefDescription": "Uops issued by the scheduler on pipe Z" + }, + { + "PublicDescription": "Uops canceled (load cancels)", + "EventCode": "0xD404", + "EventName": "FSU_CANCEL", + "BriefDescription": "Uops canceled (load cancels)" + }, + { + "PublicDescription": "Count scheduler stalls due to divide/sqrt", + "EventCode": "0xD405", + "EventName": "FSU_DIV_SQRT_STALL", + "BriefDescription": "Count scheduler stalls due to divide/sqrt" + }, + { + "PublicDescription": "Number of SWOB drains", + "EventCode": "0xD500", + "EventName": "GPC_SWOB_DRAIN", + "BriefDescription": "Number of SWOB drains" + }, + { + "PublicDescription": "GPC detected a Breakpoint instruction match", + "EventCode": "0xD501", + "EventName": "BREAKPOINT_MATCH", + "BriefDescription": "GPC detected a Breakpoint instruction match" + }, + { + "PublicDescription": "L1D TLB miss", + "EventCode": "0xD600", + "EventName": "L1D_TLB_MISS", + "BriefDescription": "L1D TLB miss" + }, + { + "PublicDescription": "OFB full cycles", + "EventCode": "0xD601", + "EventName": "OFB_FULL", + "BriefDescription": "OFB full cycles" + }, + { + "PublicDescription": "Load satisified from store forwarded data", + "EventCode": "0xD605", + "EventName": "LD_FROM_ST_FWD", + "BriefDescription": "Load satisified from store forwarded data" + }, + { + "PublicDescription": "L1 prefetcher, load prefetch requests generated", + "EventCode": "0xD606", + "EventName": "L1_PFETCH_LD_GEN", + "BriefDescription": "L1 prefetcher, load prefetch requests generated" + }, + { + "PublicDescription": "L1 prefetcher, load prefetch fills into the L1 cache", + "EventCode": "0xD607", + "EventName": "L1_PFETCH_LD_FILL", + "BriefDescription": "L1 prefetcher, load prefetch fills into the L1 cache" + }, + { + "PublicDescription": "L1 prefetcher, load prefetch to L2 generated", + "EventCode": "0xD608", + "EventName": "L1_PFETCH_L2_REQ", + "BriefDescription": "L1 prefetcher, load prefetch to L2 generated" + }, + { + "PublicDescription": "L1 prefetcher, distance was reset", + "EventCode": "0xD609", + "EventName": "L1_PFETCH_DIST_RST", + "BriefDescription": "L1 prefetcher, distance was reset" + }, + { + "PublicDescription": "L1 prefetcher, distance was increased", + "EventCode": "0xD60A", + "EventName": "L1_PFETCH_DIST_INC", + "BriefDescription": "L1 prefetcher, distance was increased" + }, + { + "PublicDescription": "L1 prefetcher, table entry is trained", + "EventCode": "0xD60B", + "EventName": "L1_PFETCH_ENTRY_TRAINED", + "BriefDescription": "L1 prefetcher, table entry is trained" + }, + { + "PublicDescription": "Store retirement pipe stall", + "EventCode": "0xD60C", + "EventName": "LSU_ST_RETIRE_STALL", + "BriefDescription": "Store retirement pipe stall" + }, + { + "PublicDescription": "LSU detected a Watchpoint data match", + "EventCode": "0xD60D", + "EventName": "WATCHPOINT_MATCH", + "BriefDescription": "LSU detected a Watchpoint data match" + }, + { + "PublicDescription": "L2 pipeline replay", + "EventCode": "0xD700", + "EventName": "L2C_PIPE_REPLAY", + "BriefDescription": "L2 pipeline replay" + }, + { + "PublicDescription": "L2 refill from I-side miss", + "EventCode": "0xD701", + "EventName": "L2C_INST_REFILL", + "BriefDescription": "L2 refill from I-side miss" + }, + { + "PublicDescription": "L2 refill from D-side miss", + "EventCode": "0xD702", + "EventName": "L2C_DATA_REFILL", + "BriefDescription": "L2 refill from D-side miss" + }, + { + "PublicDescription": "L2 prefetcher, load prefetch requests generated", + "EventCode": "0xD703", + "EventName": "L2_PREFETCH_REQ", + "BriefDescription": "L2 prefetcher, load prefetch requests generated" + }, + { + "PublicDescription": "L2D OTB allocate", + "EventCode": "0xD800", + "EventName": "MMU_D_OTB_ALLOC", + "BriefDescription": "L2D OTB allocate" + }, + { + "PublicDescription": "DTLB Translation cache hit on S1L2 walk cache entry", + "EventCode": "0xD801", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L2_WALK", + "BriefDescription": "DTLB Translation cache hit on S1L2 walk cache entry" + }, + { + "PublicDescription": "DTLB Translation cache hit on S1L1 walk cache entry", + "EventCode": "0xD802", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L1_WALK", + "BriefDescription": "DTLB Translation cache hit on S1L1 walk cache entry" + }, + { + "PublicDescription": "DTLB Translation cache hit on S1L0 walk cache entry", + "EventCode": "0xD803", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L0_WALK", + "BriefDescription": "DTLB Translation cache hit on S1L0 walk cache entry" + }, + { + "PublicDescription": "DTLB Translation cache hit on S2L2 walk cache entry", + "EventCode": "0xD804", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L2_WALK", + "BriefDescription": "DTLB Translation cache hit on S2L2 walk cache entry" + }, + { + "PublicDescription": "DTLB Translation cache hit on S2L1 walk cache entry", + "EventCode": "0xD805", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L1_WALK", + "BriefDescription": "DTLB Translation cache hit on S2L1 walk cache entry" + }, + { + "PublicDescription": "DTLB Translation cache hit on S2L0 walk cache entry", + "EventCode": "0xD806", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L0_WALK", + "BriefDescription": "DTLB Translation cache hit on S2L0 walk cache entry" + }, + { + "PublicDescription": "D-side S1 Page walk cache lookup", + "EventCode": "0xD807", + "EventName": "MMU_D_S1_WALK_CACHE_LOOKUP", + "BriefDescription": "D-side S1 Page walk cache lookup" + }, + { + "PublicDescription": "D-side S1 Page walk cache refill", + "EventCode": "0xD808", + "EventName": "MMU_D_S1_WALK_CACHE_REFILL", + "BriefDescription": "D-side S1 Page walk cache refill" + }, + { + "PublicDescription": "D-side S2 Page walk cache lookup", + "EventCode": "0xD809", + "EventName": "MMU_D_S2_WALK_CACHE_LOOKUP", + "BriefDescription": "D-side S2 Page walk cache lookup" + }, + { + "PublicDescription": "D-side S2 Page walk cache refill", + "EventCode": "0xD80A", + "EventName": "MMU_D_S2_WALK_CACHE_REFILL", + "BriefDescription": "D-side S2 Page walk cache refill" + }, + { + "PublicDescription": "D-side Stage1 tablewalk fault", + "EventCode": "0xD80B", + "EventName": "MMU_D_S1_WALK_FAULT", + "BriefDescription": "D-side Stage1 tablewalk fault" + }, + { + "PublicDescription": "D-side Stage2 tablewalk fault", + "EventCode": "0xD80C", + "EventName": "MMU_D_S2_WALK_FAULT", + "BriefDescription": "D-side Stage2 tablewalk fault" + }, + { + "PublicDescription": "D-side Tablewalk steps or descriptor fetches", + "EventCode": "0xD80D", + "EventName": "MMU_D_WALK_STEPS", + "BriefDescription": "D-side Tablewalk steps or descriptor fetches" + }, + { + "PublicDescription": "L2I OTB allocate", + "EventCode": "0xD900", + "EventName": "MMU_I_OTB_ALLOC", + "BriefDescription": "L2I OTB allocate" + }, + { + "PublicDescription": "ITLB Translation cache hit on S1L2 walk cache entry", + "EventCode": "0xD901", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L2_WALK", + "BriefDescription": "ITLB Translation cache hit on S1L2 walk cache entry" + }, + { + "PublicDescription": "ITLB Translation cache hit on S1L1 walk cache entry", + "EventCode": "0xD902", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L1_WALK", + "BriefDescription": "ITLB Translation cache hit on S1L1 walk cache entry" + }, + { + "PublicDescription": "ITLB Translation cache hit on S1L0 walk cache entry", + "EventCode": "0xD903", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L0_WALK", + "BriefDescription": "ITLB Translation cache hit on S1L0 walk cache entry" + }, + { + "PublicDescription": "ITLB Translation cache hit on S2L2 walk cache entry", + "EventCode": "0xD904", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L2_WALK", + "BriefDescription": "ITLB Translation cache hit on S2L2 walk cache entry" + }, + { + "PublicDescription": "ITLB Translation cache hit on S2L1 walk cache entry", + "EventCode": "0xD905", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L1_WALK", + "BriefDescription": "ITLB Translation cache hit on S2L1 walk cache entry" + }, + { + "PublicDescription": "ITLB Translation cache hit on S2L0 walk cache entry", + "EventCode": "0xD906", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L0_WALK", + "BriefDescription": "ITLB Translation cache hit on S2L0 walk cache entry" + }, + { + "PublicDescription": "I-side S1 Page walk cache lookup", + "EventCode": "0xD907", + "EventName": "MMU_I_S1_WALK_CACHE_LOOKUP", + "BriefDescription": "I-side S1 Page walk cache lookup" + }, + { + "PublicDescription": "I-side S1 Page walk cache refill", + "EventCode": "0xD908", + "EventName": "MMU_I_S1_WALK_CACHE_REFILL", + "BriefDescription": "I-side S1 Page walk cache refill" + }, + { + "PublicDescription": "I-side S2 Page walk cache lookup", + "EventCode": "0xD909", + "EventName": "MMU_I_S2_WALK_CACHE_LOOKUP", + "BriefDescription": "I-side S2 Page walk cache lookup" + }, + { + "PublicDescription": "I-side S2 Page walk cache refill", + "EventCode": "0xD90A", + "EventName": "MMU_I_S2_WALK_CACHE_REFILL", + "BriefDescription": "I-side S2 Page walk cache refill" + }, + { + "PublicDescription": "I-side Stage1 tablewalk fault", + "EventCode": "0xD90B", + "EventName": "MMU_I_S1_WALK_FAULT", + "BriefDescription": "I-side Stage1 tablewalk fault" + }, + { + "PublicDescription": "I-side Stage2 tablewalk fault", + "EventCode": "0xD90C", + "EventName": "MMU_I_S2_WALK_FAULT", + "BriefDescription": "I-side Stage2 tablewalk fault" + }, + { + "PublicDescription": "I-side Tablewalk steps or descriptor fetches", + "EventCode": "0xD90D", + "EventName": "MMU_I_WALK_STEPS", + "BriefDescription": "I-side Tablewalk steps or descriptor fetches" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/exception.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/exception.json new file mode 100644 index 000000000000..ada052e19632 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/exception.json @@ -0,0 +1,44 @@ +[ + { + "ArchStdEvent": "EXC_UNDEF" + }, + { + "ArchStdEvent": "EXC_SVC" + }, + { + "ArchStdEvent": "EXC_PABORT" + }, + { + "ArchStdEvent": "EXC_DABORT" + }, + { + "ArchStdEvent": "EXC_IRQ" + }, + { + "ArchStdEvent": "EXC_FIQ" + }, + { + "ArchStdEvent": "EXC_HVC" + }, + { + "ArchStdEvent": "EXC_TRAP_PABORT" + }, + { + "ArchStdEvent": "EXC_TRAP_DABORT" + }, + { + "ArchStdEvent": "EXC_TRAP_OTHER" + }, + { + "ArchStdEvent": "EXC_TRAP_IRQ" + }, + { + "ArchStdEvent": "EXC_TRAP_FIQ" + }, + { + "ArchStdEvent": "EXC_TAKEN" + }, + { + "ArchStdEvent": "EXC_RETURN" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json new file mode 100644 index 000000000000..18d1f2f76a23 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json @@ -0,0 +1,89 @@ +[ + { + "ArchStdEvent": "SW_INCR" + }, + { + "ArchStdEvent": "ST_RETIRED" + }, + { + "ArchStdEvent": "OP_SPEC" + }, + { + "ArchStdEvent": "LD_SPEC" + }, + { + "ArchStdEvent": "ST_SPEC" + }, + { + "ArchStdEvent": "LDST_SPEC" + }, + { + "ArchStdEvent": "DP_SPEC" + }, + { + "ArchStdEvent": "ASE_SPEC" + }, + { + "ArchStdEvent": "VFP_SPEC" + }, + { + "ArchStdEvent": "PC_WRITE_SPEC" + }, + { + "ArchStdEvent": "BR_IMMED_RETIRED" + }, + { + "ArchStdEvent": "BR_RETURN_RETIRED" + }, + { + "ArchStdEvent": "CRYPTO_SPEC" + }, + { + "ArchStdEvent": "ISB_SPEC" + }, + { + "ArchStdEvent": "DSB_SPEC" + }, + { + "ArchStdEvent": "DMB_SPEC" + }, + { + "ArchStdEvent": "RC_LD_SPEC" + }, + { + "ArchStdEvent": "RC_ST_SPEC" + }, + { + "ArchStdEvent": "INST_RETIRED" + }, + { + "ArchStdEvent": "CID_WRITE_RETIRED" + }, + { + "ArchStdEvent": "PC_WRITE_RETIRED" + }, + { + "ArchStdEvent": "INST_SPEC" + }, + { + "ArchStdEvent": "TTBR_WRITE_RETIRED" + }, + { + "ArchStdEvent": "BR_RETIRED" + }, + { + "ArchStdEvent": "BR_MIS_PRED_RETIRED" + }, + { + "ArchStdEvent": "OP_RETIRED" + }, + { + "ArchStdEvent": "OP_SPEC" + }, + { + "PublicDescription": "Operation speculatively executed, NOP", + "EventCode": "0x100", + "EventName": "NOP_SPEC", + "BriefDescription": "Speculatively executed, NOP" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/intrinsic.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/intrinsic.json new file mode 100644 index 000000000000..7ecffb989ae0 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/intrinsic.json @@ -0,0 +1,14 @@ +[ + { + "ArchStdEvent": "LDREX_SPEC" + }, + { + "ArchStdEvent": "STREX_PASS_SPEC" + }, + { + "ArchStdEvent": "STREX_FAIL_SPEC" + }, + { + "ArchStdEvent": "STREX_SPEC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/memory.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/memory.json new file mode 100644 index 000000000000..0711782bfa6b --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/memory.json @@ -0,0 +1,44 @@ +[ + { + "ArchStdEvent": "LD_RETIRED" + }, + { + "ArchStdEvent": "MEM_ACCESS_RD" + }, + { + "ArchStdEvent": "MEM_ACCESS_WR" + }, + { + "ArchStdEvent": "UNALIGNED_LD_SPEC" + }, + { + "ArchStdEvent": "UNALIGNED_ST_SPEC" + }, + { + "ArchStdEvent": "UNALIGNED_LDST_SPEC" + }, + { + "ArchStdEvent": "LD_ALIGN_LAT" + }, + { + "ArchStdEvent": "ST_ALIGN_LAT" + }, + { + "ArchStdEvent": "MEM_ACCESS" + }, + { + "ArchStdEvent": "MEMORY_ERROR" + }, + { + "ArchStdEvent": "LDST_ALIGN_LAT" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED_RD" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED_WR" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/pipeline.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/pipeline.json new file mode 100644 index 000000000000..f9fae15f7555 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/pipeline.json @@ -0,0 +1,23 @@ +[ + { + "ArchStdEvent": "STALL_FRONTEND" + }, + { + "ArchStdEvent": "STALL_BACKEND" + }, + { + "ArchStdEvent": "STALL" + }, + { + "ArchStdEvent": "STALL_SLOT_BACKEND" + }, + { + "ArchStdEvent": "STALL_SLOT_FRONTEND" + }, + { + "ArchStdEvent": "STALL_SLOT" + }, + { + "ArchStdEvent": "STALL_BACKEND_MEM" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/spe.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/spe.json new file mode 100644 index 000000000000..20f2165c85fe --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/spe.json @@ -0,0 +1,14 @@ +[ + { + "ArchStdEvent": "SAMPLE_POP" + }, + { + "ArchStdEvent": "SAMPLE_FEED" + }, + { + "ArchStdEvent": "SAMPLE_FILTRATE" + }, + { + "ArchStdEvent": "SAMPLE_COLLISION" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 9d400785d195..32674ddd2b63 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -41,3 +41,4 @@ 0x00000000460f0010,v1,fujitsu/a64fx,core 0x00000000480fd010,v1,hisilicon/hip08,core 0x00000000500f0000,v1,ampere/emag,core +0x00000000c00fac30,v1,ampere/ampereone,core -- cgit v1.2.3 From 68d12418261090b4f5b8d1b2067d15062e858e01 Mon Sep 17 00:00:00 2001 From: Anup Sharma Date: Fri, 19 May 2023 13:11:24 +0530 Subject: perf test: Add test validating JSON generated by 'perf data convert --to-json' This commit adds support for testing the JSON output generated by the 'perf data' command's conversion to JSON functionality. The test script now includes a step to ensure that the resulting JSON file contains valid data. Changes: V1 -> V2: Added a check for the existence of the result output file. Replaced the usage of jq with json.load for validating the JSON format. Checks using ShellCheck and checkpatch, addressing and resolving warnings. Removed the unnecessary root permission check. Modified the 'perf record' command to avoid requiring root permissions. Committer testing: $ perf test to-json 115: 'perf data convert --to-json' command test : Ok $ perf test -v to-json Couldn't bump rlimit(MEMLOCK), failures may take place when creating BPF maps, etc 115: 'perf data convert --to-json' command test : --- start --- test child forked, pid 1746867 Testing Perf Data Convertion Command to JSON Perf Data Converter Command to JSON [SUCCESS] Validating Perf Data Converted JSON file The file contains valid JSON format [SUCCESS] test child finished with 0 ---- end ---- 'perf data convert --to-json' command test: Ok $ Signed-off-by: Anup Sharma Acked-by: Ian Rogers Link: https://lore.kernel.org/r/ZGcoJBAGlknjsA/n@yoga Tested-by: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Anup Sharma Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Alexander Shishkin Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org [ Fixup indentation to use consistently tabs, not a mixture of spaces and tabs, have 'if ... ; then' on the same line ] Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/test_perf_data_converter_json.sh | 72 ++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100755 tools/perf/tests/shell/test_perf_data_converter_json.sh diff --git a/tools/perf/tests/shell/test_perf_data_converter_json.sh b/tools/perf/tests/shell/test_perf_data_converter_json.sh new file mode 100755 index 000000000000..72ac6c83231c --- /dev/null +++ b/tools/perf/tests/shell/test_perf_data_converter_json.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# 'perf data convert --to-json' command test +# SPDX-License-Identifier: GPL-2.0 + +set -e + +err=0 + +if [ "$PYTHON" = "" ] ; then + if which python3 > /dev/null ; then + PYTHON=python3 + elif which python > /dev/null ; then + PYTHON=python + else + echo Skipping test, python not detected please set environment variable PYTHON. + exit 2 + fi +fi + +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +result=$(mktemp /tmp/__perf_test.output.json.XXXXX) + +cleanup() +{ + rm -f "${perfdata}" + rm -f "${result}" + trap - exit term int +} + +trap_cleanup() +{ + cleanup + exit ${err} +} +trap trap_cleanup exit term int + +test_json_converter_command() +{ + echo "Testing Perf Data Convertion Command to JSON" + perf record -o "$perfdata" -F 99 -g -- perf test -w noploop > /dev/null 2>&1 + perf data convert --to-json "$result" --force -i "$perfdata" >/dev/null 2>&1 + if [ $(cat "${result}" | wc -l) -gt "0" ] ; then + echo "Perf Data Converter Command to JSON [SUCCESS]" + else + echo "Perf Data Converter Command to JSON [FAILED]" + err=1 + exit + fi +} + +validate_json_format() +{ + echo "Validating Perf Data Converted JSON file" + if [ -f "$result" ] ; then + if $PYTHON -c "import json; json.load(open('$result'))" >/dev/null 2>&1 ; then + echo "The file contains valid JSON format [SUCCESS]" + else + echo "The file does not contain valid JSON format [FAILED]" + err=1 + exit + fi + else + echo "File not found [FAILED]" + err=2 + exit + fi +} + +test_json_converter_command +validate_json_format + +exit ${err} -- cgit v1.2.3 From c04fcf7c8c4dfdcbfca8b8ec3e7e1fcb6d99e3e3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:50 -0700 Subject: perf vendor events intel: Update alderlake events/metrics Update events to v21 including the new event SQ_MISC.BUS_LOCK and improved comments. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/alderlake/adl-metrics.json | 1314 ++++++++++---------- .../perf/pmu-events/arch/x86/alderlake/cache.json | 9 + .../perf/pmu-events/arch/x86/alderlake/memory.json | 6 +- .../arch/x86/alderlaken/adln-metrics.json | 276 ++-- tools/perf/pmu-events/arch/x86/mapfile.csv | 4 +- 5 files changed, 784 insertions(+), 825 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 840f6f6fc8c5..c9f7e3d4ab08 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -71,7 +71,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -120,7 +120,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions.", - "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_alloc_restriction", "MetricThreshold": "tma_alloc_restriction > 0.1", @@ -129,7 +129,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", - "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.1", @@ -151,7 +151,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", - "MetricExpr": "(tma_info_slots - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / tma_info_slots", + "MetricExpr": "(tma_info_core_slots - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -162,7 +162,7 @@ }, { "BriefDescription": "Counts the number of uops that are not from the microsequencer.", - "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@) / tma_info_slots", + "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@) / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_base", "MetricThreshold": "tma_base > 0.6", @@ -172,7 +172,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend", - "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_detect", "MetricThreshold": "tma_branch_detect > 0.05", @@ -182,7 +182,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.05", @@ -192,7 +192,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.", - "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteer", "MetricThreshold": "tma_branch_resteer > 0.05", @@ -201,7 +201,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).", - "MetricExpr": "TOPDOWN_FE_BOUND.CISC / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.CISC / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_cisc", "MetricThreshold": "tma_cisc > 0.05", @@ -220,7 +220,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.", - "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_decode", "MetricThreshold": "tma_decode > 0.05", @@ -239,7 +239,7 @@ { "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1", @@ -248,7 +248,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_fast_nuke", "MetricThreshold": "tma_fast_nuke > 0.05", @@ -257,7 +257,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", - "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_bandwidth", "MetricThreshold": "tma_fetch_bandwidth > 0.1", @@ -267,7 +267,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", - "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.15", @@ -286,7 +286,7 @@ }, { "BriefDescription": "Counts the number of floating point divide operations per uop.", - "MetricExpr": "UOPS_RETIRED.FPDIV / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.FPDIV / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group", "MetricName": "tma_fpdiv_uops", "MetricThreshold": "tma_fpdiv_uops > 0.2", @@ -295,7 +295,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", - "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.2", @@ -305,254 +305,228 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.", - "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05", "ScaleUnit": "100%", "Unit": "cpu_atom" }, - { - "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block", - "MetricExpr": "100 * cpu_atom@LD_BLOCKS.4K_ALIAS@ / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_address_alias_blocks", - "Unit": "cpu_atom" - }, - { - "BriefDescription": "Ratio of all branches which mispredict", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_branch_mispredict_ratio", - "Unit": "cpu_atom" - }, - { - "BriefDescription": "Ratio between Mispredicted branches and unknown branches", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BACLEARS.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_branch_mispredict_to_unknown_branch_ratio", - "Unit": "cpu_atom" - }, { "BriefDescription": "", "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@", - "MetricGroup": " ", - "MetricName": "tma_info_clks", + "MetricName": "tma_info_core_clks", "Unit": "cpu_atom" }, { "BriefDescription": "", "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@", - "MetricGroup": " ", - "MetricName": "tma_info_clks_p", + "MetricName": "tma_info_core_clks_p", "Unit": "cpu_atom" }, { "BriefDescription": "Cycles Per Instruction", - "MetricExpr": "tma_info_clks / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_cpi", + "MetricExpr": "tma_info_core_clks / INST_RETIRED.ANY", + "MetricName": "tma_info_core_cpi", "Unit": "cpu_atom" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": " ", - "MetricName": "tma_info_cpu_utilization", + "BriefDescription": "Instructions Per Cycle", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricName": "tma_info_core_ipc", "Unit": "cpu_atom" }, { - "BriefDescription": "Cycle cost per DRAM hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_LOAD_UOPS_RETIRED.DRAM_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_dram_hit", + "BriefDescription": "", + "MetricExpr": "5 * tma_info_core_clks", + "MetricName": "tma_info_core_slots", "Unit": "cpu_atom" }, { - "BriefDescription": "Cycle cost per L2 hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_l2_hit", + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.ALL / INST_RETIRED.ANY", + "MetricName": "tma_info_core_upi", "Unit": "cpu_atom" }, { - "BriefDescription": "Cycle cost per LLC hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_LOAD_UOPS_RETIRED.L3_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_l3_hit", + "BriefDescription": "Percent of instruction miss cost that hit in DRAM", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", + "MetricName": "tma_info_frontend_inst_miss_cost_dramhit_percent", "Unit": "cpu_atom" }, { - "BriefDescription": "Percentage of all uops which are FPDiv uops", - "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_fpdiv_uop_ratio", + "BriefDescription": "Percent of instruction miss cost that hit in the L2", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", + "MetricName": "tma_info_frontend_inst_miss_cost_l2hit_percent", "Unit": "cpu_atom" }, { - "BriefDescription": "Percentage of all uops which are IDiv uops", - "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_idiv_uop_ratio", + "BriefDescription": "Percent of instruction miss cost that hit in the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", + "MetricName": "tma_info_frontend_inst_miss_cost_l3hit_percent", "Unit": "cpu_atom" }, { - "BriefDescription": "Percent of instruction miss cost that hit in DRAM", - "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_dramhit_percent", + "BriefDescription": "Ratio of all branches which mispredict", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES", + "MetricName": "tma_info_inst_mix_branch_mispredict_ratio", "Unit": "cpu_atom" }, { - "BriefDescription": "Percent of instruction miss cost that hit in the L2", - "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_l2hit_percent", + "BriefDescription": "Ratio between Mispredicted branches and unknown branches", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BACLEARS.ANY", + "MetricName": "tma_info_inst_mix_branch_mispredict_to_unknown_branch_ratio", "Unit": "cpu_atom" }, { - "BriefDescription": "Percent of instruction miss cost that hit in the L3", - "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_l3hit_percent", + "BriefDescription": "Percentage of all uops which are FPDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_fpdiv_uop_ratio", "Unit": "cpu_atom" }, { - "BriefDescription": "Instructions per Branch (lower number means higher occurance rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_ipbranch", + "BriefDescription": "Percentage of all uops which are IDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_idiv_uop_ratio", "Unit": "cpu_atom" }, { - "BriefDescription": "Instructions Per Cycle", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": " ", - "MetricName": "tma_info_ipc", + "BriefDescription": "Instructions per Branch (lower number means higher occurance rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", + "MetricName": "tma_info_inst_mix_ipbranch", "Unit": "cpu_atom" }, { "BriefDescription": "Instruction per (near) call (lower number means higher occurance rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.CALL", - "MetricGroup": " ", - "MetricName": "tma_info_ipcall", + "MetricName": "tma_info_inst_mix_ipcall", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per Far Branch", "MetricExpr": "INST_RETIRED.ANY / (cpu_atom@BR_INST_RETIRED.FAR_BRANCH@ / 2)", - "MetricGroup": " ", - "MetricName": "tma_info_ipfarbranch", + "MetricName": "tma_info_inst_mix_ipfarbranch", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per Load", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricGroup": " ", - "MetricName": "tma_info_ipload", + "MetricName": "tma_info_inst_mix_ipload", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken", "MetricExpr": "INST_RETIRED.ANY / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)", - "MetricName": "tma_info_ipmisp_cond_ntaken", + "MetricName": "tma_info_inst_mix_ipmisp_cond_ntaken", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricName": "tma_info_ipmisp_cond_taken", + "MetricName": "tma_info_inst_mix_ipmisp_cond_taken", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricName": "tma_info_ipmisp_indirect", + "MetricName": "tma_info_inst_mix_ipmisp_indirect", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per retired return Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RETURN", - "MetricName": "tma_info_ipmisp_ret", + "MetricName": "tma_info_inst_mix_ipmisp_ret", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per retired Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_ipmispredict", + "MetricName": "tma_info_inst_mix_ipmispredict", "Unit": "cpu_atom" }, { "BriefDescription": "Instructions per Store", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", - "MetricGroup": " ", - "MetricName": "tma_info_ipstore", + "MetricName": "tma_info_inst_mix_ipstore", "Unit": "cpu_atom" }, { - "BriefDescription": "Fraction of cycles spent in Kernel mode", - "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE", - "MetricGroup": " ", - "MetricName": "tma_info_kernel_utilization", + "BriefDescription": "Percentage of all uops which are ucode ops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_microcode_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are x87 uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_x87_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.4K_ALIAS@ / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricName": "tma_info_l1_bound_address_alias_blocks", "Unit": "cpu_atom" }, { "BriefDescription": "Percentage of total non-speculative loads that are splits", "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_load_splits", + "MetricName": "tma_info_l1_bound_load_splits", "Unit": "cpu_atom" }, { - "BriefDescription": "load ops retired per 1000 instruction", - "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_memloadpki", + "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricName": "tma_info_l1_bound_store_fwd_blocks", "Unit": "cpu_atom" }, { - "BriefDescription": "Percentage of all uops which are ucode ops", - "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_microcode_uop_ratio", + "BriefDescription": "Cycle cost per DRAM hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_LOAD_UOPS_RETIRED.DRAM_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_dram_hit", "Unit": "cpu_atom" }, { - "BriefDescription": "", - "MetricExpr": "5 * tma_info_clks", - "MetricGroup": " ", - "MetricName": "tma_info_slots", + "BriefDescription": "Cycle cost per L2 hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_l2_hit", "Unit": "cpu_atom" }, { - "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", - "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_store_fwd_blocks", + "BriefDescription": "Cycle cost per LLC hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_LOAD_UOPS_RETIRED.L3_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_l3_hit", "Unit": "cpu_atom" }, { - "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", - "MetricGroup": " ", - "MetricName": "tma_info_turbo_utilization", + "BriefDescription": "load ops retired per 1000 instruction", + "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / INST_RETIRED.ANY", + "MetricName": "tma_info_memory_memloadpki", "Unit": "cpu_atom" }, { - "BriefDescription": "Uops Per Instruction", - "MetricExpr": "UOPS_RETIRED.ALL / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_upi", + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_atom" }, { - "BriefDescription": "Percentage of all uops which are x87 uops", - "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_x87_uop_ratio", + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_kernel_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_core_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.", - "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05", @@ -561,7 +535,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.", - "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1", @@ -571,7 +545,7 @@ { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.1", @@ -580,7 +554,7 @@ }, { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.", - "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / tma_info_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", + "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.1", @@ -598,7 +572,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricName": "tma_machine_clears", "MetricThreshold": "tma_machine_clears > 0.05", @@ -608,7 +582,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.", - "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_mem_scheduler", "MetricThreshold": "tma_mem_scheduler > 0.1", @@ -617,7 +591,7 @@ }, { "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads.", - "MetricExpr": "min(cpu_atom@TOPDOWN_BE_BOUND.ALL@ / tma_info_slots, cpu_atom@LD_HEAD.ANY_AT_RET@ / tma_info_clks + tma_store_bound)", + "MetricExpr": "min(cpu_atom@TOPDOWN_BE_BOUND.ALL@ / tma_info_core_slots, cpu_atom@LD_HEAD.ANY_AT_RET@ / tma_info_core_clks + tma_store_bound)", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2", @@ -636,7 +610,7 @@ }, { "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)", - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_ms_uops", "MetricThreshold": "tma_ms_uops > 0.05", @@ -647,7 +621,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.", - "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_non_mem_scheduler", "MetricThreshold": "tma_non_mem_scheduler > 0.1", @@ -656,7 +630,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear (slow nuke).", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_nuke", "MetricThreshold": "tma_nuke > 0.05", @@ -665,7 +639,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.", - "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_other_fb", "MetricThreshold": "tma_other_fb > 0.05", @@ -674,7 +648,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.", - "MetricExpr": "LD_HEAD.OTHER_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.OTHER_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_other_l1", "MetricThreshold": "tma_other_l1 > 0.05", @@ -692,7 +666,7 @@ }, { "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.", - "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@ - cpu_atom@UOPS_RETIRED.FPDIV@) / tma_info_slots", + "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@ - cpu_atom@UOPS_RETIRED.FPDIV@) / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group", "MetricName": "tma_other_ret", "MetricThreshold": "tma_other_ret > 0.3", @@ -710,7 +684,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.", - "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_predecode", "MetricThreshold": "tma_predecode > 0.05", @@ -719,7 +693,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).", - "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_register", "MetricThreshold": "tma_register > 0.1", @@ -728,7 +702,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls).", - "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_reorder_buffer", "MetricThreshold": "tma_reorder_buffer > 0.1", @@ -748,7 +722,7 @@ }, { "BriefDescription": "Counts the numer of issue slots that result in retirement slots.", - "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.75", @@ -767,7 +741,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).", - "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_serialization", "MetricThreshold": "tma_serialization > 0.1", @@ -794,7 +768,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.", - "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_stlb_hit", "MetricThreshold": "tma_stlb_hit > 0.05", @@ -803,7 +777,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a second level TLB miss requiring a page walk.", - "MetricExpr": "LD_HEAD.PGWALK_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.PGWALK_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_stlb_miss", "MetricThreshold": "tma_stlb_miss > 0.05", @@ -821,7 +795,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.", - "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.05", @@ -830,7 +804,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -839,7 +813,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_slots", + "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -849,7 +823,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists.", - "MetricExpr": "63 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_slots", + "MetricExpr": "63 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_avx_assists", "MetricThreshold": "tma_avx_assists > 0.1", @@ -858,7 +832,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -880,18 +854,18 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -911,7 +885,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_clks", + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -922,7 +896,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(25 * tma_info_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_average_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", + "MetricExpr": "(25 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -944,7 +918,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "24 * tma_info_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", + "MetricExpr": "24 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -954,17 +928,17 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -975,7 +949,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@ / tma_info_clks", + "MetricExpr": "cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@ / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -985,47 +959,47 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@) / tma_info_core_clks", + "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "28 * tma_info_average_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_clks", + "MetricExpr": "28 * tma_info_system_average_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1035,11 +1009,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1048,15 +1022,15 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -1088,7 +1062,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", - "MetricExpr": "30 * cpu_core@ASSISTS.FP@ / tma_info_slots", + "MetricExpr": "30 * cpu_core@ASSISTS.FP@ / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_fp_assists", "MetricThreshold": "tma_fp_assists > 0.1", @@ -1098,7 +1072,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1108,7 +1082,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1118,7 +1092,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1128,7 +1102,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1138,7 +1112,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -1149,7 +1123,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", - "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", @@ -1159,7 +1133,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1170,7 +1144,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_DATA.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1179,251 +1153,300 @@ "Unit": "cpu_core" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "Unit": "cpu_core" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch", + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200", "Unit": "cpu_core" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers", + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.COND@ + 3 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + (cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code", + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@BR_MISP_RETIRED.INDIRECT_CALL\\,umask\\=0x80@ / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret", + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500", "Unit": "cpu_core" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks", + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200", "Unit": "cpu_core" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki", + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of branches that are non-taken conditionals", - "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt", + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of branches that are taken conditionals", - "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk", + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: ", "Unit": "cpu_core" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead", "Unit": "cpu_core" }, { - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@", - "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks", + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.COND@ + 3 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + (cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code", "Unit": "cpu_core" }, { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc", + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20", "Unit": "cpu_core" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi", + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization", + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store", "Unit": "cpu_core" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp", + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / cpu_core@UOPS_ISSUED.ANY@", - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 6 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "BriefDescription": "Fraction of branches that are non-taken conditionals", + "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_nt", "Unit": "cpu_core" }, { - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", - "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost", + "BriefDescription": "Fraction of branches that are taken conditionals", + "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_tk", "Unit": "cpu_core" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute", + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump", "Unit": "cpu_core" }, { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage.", + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches", "Unit": "cpu_core" }, { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki", + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@", + "MetricGroup": "SMT", + "MetricName": "tma_info_core_core_clks", "Unit": "cpu_core" }, { - "BriefDescription": "Average number of Uops issued by front-end when it issued something", - "MetricExpr": "UOPS_ISSUED.ANY / cpu_core@UOPS_ISSUED.ANY\\,cmask\\=1@", - "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc", + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_core_coreipc", "Unit": "cpu_core" }, { "BriefDescription": "Floating Point Operations Per Cycle", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_clks", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks", "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc", + "MetricName": "tma_info_core_flopc", "Unit": "cpu_core" }, { "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_core_clks)", "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", + "MetricName": "tma_info_core_fp_arith_utilization", "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).", "Unit": "cpu_core" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: ", + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "DSBmiss", + "MetricName": "tma_info_frontend_dsb_switch_cost", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of Uops issued by front-end when it issued something", + "MetricExpr": "UOPS_ISSUED.ANY / cpu_core@UOPS_ISSUED.ANY\\,cmask\\=1@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_frontend_fetch_upc", "Unit": "cpu_core" }, { "BriefDescription": "Average Latency for L1 instruction cache misses", "MetricExpr": "ICACHE_DATA.STALLS / cpu_core@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency", + "MetricName": "tma_info_frontend_icache_miss_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp", + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20", + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", + "MetricExpr": "LSD.UOPS / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "Fed;LSD", + "MetricName": "tma_info_frontend_lsd_coverage", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch", "Unit": "cpu_core" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "cpu_core@INST_RETIRED.ANY@", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", "Unit": "cpu_core" }, @@ -1431,8 +1454,8 @@ "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW.", "Unit": "cpu_core" }, @@ -1440,8 +1463,8 @@ "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", "Unit": "cpu_core" }, @@ -1449,8 +1472,8 @@ "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", "Unit": "cpu_core" }, @@ -1458,8 +1481,8 @@ "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", "Unit": "cpu_core" }, @@ -1467,494 +1490,445 @@ "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.", "Unit": "cpu_core" }, - { - "BriefDescription": "Instructions per a microcode Assist invocation", - "MetricExpr": "INST_RETIRED.ANY / cpu_core@ASSISTS.ANY\\,umask\\=0x1B@", - "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_ipassist", - "MetricThreshold": "tma_info_ipassist < 100e3", - "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)", - "Unit": "cpu_core" - }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc", + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / cpu_core@BR_INST_RETIRED.FAR_BRANCH@u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6", + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10", + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "cpu_core@BR_MISP_RETIRED.INDIRECT_CALL\\,umask\\=0x80@ / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200", + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8", + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8", "Unit": "cpu_core" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu_core@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100", + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100", "Unit": "cpu_core" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 13", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch", + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 13", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump", + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / cpu_core@INST_RETIRED.ANY_P@k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi", + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05", + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t", + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki", + "MetricName": "tma_info_memory_l1mpki", "Unit": "cpu_core" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t", + "MetricName": "tma_info_memory_l1mpki_load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all", + "MetricName": "tma_info_memory_l2hpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load", + "MetricName": "tma_info_memory_l2hpki_load", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki", + "MetricName": "tma_info_memory_l2mpki", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all", - "Unit": "cpu_core" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code", - "Unit": "cpu_core" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all", + "MetricName": "tma_info_memory_l2mpki_all", "Unit": "cpu_core" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load", - "Unit": "cpu_core" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw", + "MetricName": "tma_info_memory_l2mpki_load", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t", + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw", + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t", + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", "Unit": "cpu_core" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki", + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp", "Unit": "cpu_core" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency", + "MetricName": "tma_info_memory_oro_load_l2_miss_latency", "Unit": "cpu_core" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp", + "MetricName": "tma_info_memory_oro_load_l2_mlp", "Unit": "cpu_core" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency", + "MetricName": "tma_info_memory_oro_load_l3_miss_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency", + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki", "Unit": "cpu_core" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki", + "MetricName": "tma_info_memory_tlb_load_stlb_mpki", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", - "MetricExpr": "LSD.UOPS / cpu_core@UOPS_ISSUED.ANY@", - "MetricGroup": "Fed;LSD", - "MetricName": "tma_info_lsd_coverage", + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(cpu_core@ITLB_MISSES.WALK_PENDING@ + cpu_core@DTLB_LOAD_MISSES.WALK_PENDING@ + cpu_core@DTLB_STORE_MISSES.WALK_PENDING@) / (4 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5", "Unit": "cpu_core" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / cpu_core@UNC_ARB_DAT_OCCUPANCY.RD\\,cmask\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches", + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki", "Unit": "cpu_core" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)", + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute", "Unit": "cpu_core" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency", + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / cpu_core@ASSISTS.ANY\\,umask\\=0x1B@", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full", + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store", + "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", + "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_strings_cycles", + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency", + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_core" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches", + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.", "Unit": "cpu_core" }, { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(cpu_core@ITLB_MISSES.WALK_PENDING@ + cpu_core@DTLB_LOAD_MISSES.WALK_PENDING@ + cpu_core@DTLB_STORE_MISSES.WALK_PENDING@) / (4 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5", + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / cpu_core@BR_INST_RETIRED.FAR_BRANCH@u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6", "Unit": "cpu_core" }, { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire", + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / cpu_core@INST_RETIRED.ANY_P@k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi", "Unit": "cpu_core" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "cpu_core@TOPDOWN.SLOTS@", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots", + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05", "Unit": "cpu_core" }, { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (cpu_core@TOPDOWN.SLOTS@ / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization", + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / cpu_core@UNC_ARB_DAT_OCCUPANCY.RD\\,cmask\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_request_latency", "Unit": "cpu_core" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization", + "MetricName": "tma_info_system_smt_2t_utilization", "Unit": "cpu_core" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks", + "MetricName": "tma_info_system_socket_clks", "Unit": "cpu_core" }, { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki", + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", "Unit": "cpu_core" }, { - "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", - "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_strings_cycles", - "MetricThreshold": "tma_info_strings_cycles > 0.1", + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks", "Unit": "cpu_core" }, { - "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", - "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization", + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi", + "Unit": "cpu_core" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "cpu_core@TOPDOWN.SLOTS@", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (cpu_core@TOPDOWN.SLOTS@ / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization", "Unit": "cpu_core" }, { "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05", + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05", "Unit": "cpu_core" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 9", + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 9", "Unit": "cpu_core" }, { @@ -1969,7 +1943,7 @@ }, { "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_128b", "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1979,7 +1953,7 @@ }, { "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1989,7 +1963,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_TAG.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1999,7 +1973,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_clks, 0)", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2010,7 +1984,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_clks", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2020,7 +1994,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_clks", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2030,21 +2004,21 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_average_frequency * cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_clks", + "MetricExpr": "9 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "DECODE.LCP / tma_info_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2061,7 +2035,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -2080,7 +2054,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -2090,7 +2064,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ - cpu_core@L2_RQSTS.ALL_RFO@) + cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@ * (10 * cpu_core@L2_RQSTS.RFO_HIT@ + min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@))) / tma_info_clks", + "MetricExpr": "(16 * max(0, cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ - cpu_core@L2_RQSTS.ALL_RFO@) + cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@ * (10 * cpu_core@L2_RQSTS.RFO_HIT@ + min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2100,10 +2074,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -2121,27 +2095,27 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2152,7 +2126,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_clks", + "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -2162,7 +2136,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_light_operations * cpu_core@MEM_UOP_RETIRED.ANY@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@MEM_UOP_RETIRED.ANY@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", @@ -2171,7 +2145,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -2181,27 +2155,27 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_clks", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", - "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_clks", + "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", @@ -2211,7 +2185,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / cpu_core@UOPS_ISSUED.ANY@) / tma_info_clks", + "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -2221,7 +2195,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", - "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", @@ -2231,7 +2205,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -2252,7 +2226,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", - "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_slots", + "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_page_faults", "MetricThreshold": "tma_page_faults > 0.05", @@ -2262,7 +2236,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -2272,7 +2246,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -2282,7 +2256,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -2292,7 +2266,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", + "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -2302,7 +2276,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_clks", + "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2312,7 +2286,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2322,7 +2296,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2332,7 +2306,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -2342,7 +2316,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2353,7 +2327,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -2363,7 +2337,7 @@ }, { "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", - "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_slots)", + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", "MetricName": "tma_shuffles", "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -2372,7 +2346,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_clks", + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -2382,7 +2356,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * cpu_core@LD_BLOCKS.NO_SR@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu_core@LD_BLOCKS.NO_SR@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2392,7 +2366,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2402,17 +2376,17 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_clks", + "MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -2422,7 +2396,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_clks", + "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2432,7 +2406,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_clks", + "MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2442,7 +2416,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_4_9@ + cpu_core@UOPS_DISPATCHED.PORT_7_8@) / (4 * tma_info_core_clks)", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_4_9@ + cpu_core@UOPS_DISPATCHED.PORT_7_8@) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -2461,7 +2435,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -2470,7 +2444,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * cpu_core@OCR.STREAMING_WR.ANY_RESPONSE@ / tma_info_clks", + "MetricExpr": "9 * cpu_core@OCR.STREAMING_WR.ANY_RESPONSE@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -2480,7 +2454,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_clks", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json index 51770416bcc2..b3d7f8fb50df 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json @@ -1017,6 +1017,15 @@ "UMask": "0x1", "Unit": "cpu_core" }, + { + "BriefDescription": "Counts bus locks, accounts for cache line split locks and UC locks.", + "EventCode": "0x2c", + "EventName": "SQ_MISC.BUS_LOCK", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, { "BriefDescription": "Number of PREFETCHNTA instructions executed.", "EventCode": "0x40", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/memory.json b/tools/perf/pmu-events/arch/x86/alderlake/memory.json index 55827b276e6e..73d92d5c9f9d 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/memory.json @@ -93,19 +93,21 @@ "Unit": "cpu_core" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "BriefDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding.", "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_core" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "BriefDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding.", "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json index f4b3c3883643..ed9ff25a03cf 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json @@ -86,7 +86,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions.", - "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_alloc_restriction", "MetricThreshold": "tma_alloc_restriction > 0.1", @@ -94,7 +94,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", - "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.1", @@ -114,7 +114,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", - "MetricExpr": "(tma_info_slots - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / tma_info_slots", + "MetricExpr": "(tma_info_core_slots - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -124,7 +124,7 @@ }, { "BriefDescription": "Counts the number of uops that are not from the microsequencer.", - "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / tma_info_slots", + "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_base", "MetricThreshold": "tma_base > 0.6", @@ -133,7 +133,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend", - "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_detect", "MetricThreshold": "tma_branch_detect > 0.05", @@ -142,7 +142,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.05", @@ -151,7 +151,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.", - "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteer", "MetricThreshold": "tma_branch_resteer > 0.05", @@ -159,7 +159,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).", - "MetricExpr": "TOPDOWN_FE_BOUND.CISC / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.CISC / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_cisc", "MetricThreshold": "tma_cisc > 0.05", @@ -176,7 +176,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.", - "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_decode", "MetricThreshold": "tma_decode > 0.05", @@ -193,7 +193,7 @@ { "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / tma_info_core_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_core_clks, 0) * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1", @@ -201,7 +201,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_fast_nuke", "MetricThreshold": "tma_fast_nuke > 0.05", @@ -209,7 +209,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", - "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_bandwidth", "MetricThreshold": "tma_fetch_bandwidth > 0.1", @@ -218,7 +218,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", - "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.15", @@ -235,7 +235,7 @@ }, { "BriefDescription": "Counts the number of floating point divide operations per uop.", - "MetricExpr": "UOPS_RETIRED.FPDIV / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.FPDIV / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group", "MetricName": "tma_fpdiv_uops", "MetricThreshold": "tma_fpdiv_uops > 0.2", @@ -243,7 +243,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", - "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.2", @@ -252,218 +252,192 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.", - "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05", "ScaleUnit": "100%" }, - { - "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block", - "MetricExpr": "100 * LD_BLOCKS.4K_ALIAS / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_address_alias_blocks" - }, - { - "BriefDescription": "Ratio of all branches which mispredict", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_branch_mispredict_ratio" - }, - { - "BriefDescription": "Ratio between Mispredicted branches and unknown branches", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BACLEARS.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_branch_mispredict_to_unknown_branch_ratio" - }, { "BriefDescription": "", "MetricExpr": "CPU_CLK_UNHALTED.CORE", - "MetricGroup": " ", - "MetricName": "tma_info_clks" + "MetricName": "tma_info_core_clks" }, { "BriefDescription": "", "MetricExpr": "CPU_CLK_UNHALTED.CORE_P", - "MetricGroup": " ", - "MetricName": "tma_info_clks_p" + "MetricName": "tma_info_core_clks_p" }, { "BriefDescription": "Cycles Per Instruction", - "MetricExpr": "tma_info_clks / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": " ", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Cycle cost per DRAM hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_LOAD_UOPS_RETIRED.DRAM_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_dram_hit" - }, - { - "BriefDescription": "Cycle cost per L2 hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_l2_hit" + "MetricExpr": "tma_info_core_clks / INST_RETIRED.ANY", + "MetricName": "tma_info_core_cpi" }, { - "BriefDescription": "Cycle cost per LLC hit", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_LOAD_UOPS_RETIRED.L3_HIT", - "MetricGroup": " ", - "MetricName": "tma_info_cycles_per_demand_load_l3_hit" + "BriefDescription": "Instructions Per Cycle", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricName": "tma_info_core_ipc" }, { - "BriefDescription": "Percentage of all uops which are FPDiv uops", - "MetricExpr": "100 * UOPS_RETIRED.FPDIV / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_fpdiv_uop_ratio" + "BriefDescription": "", + "MetricExpr": "5 * tma_info_core_clks", + "MetricName": "tma_info_core_slots" }, { - "BriefDescription": "Percentage of all uops which are IDiv uops", - "MetricExpr": "100 * UOPS_RETIRED.IDIV / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_idiv_uop_ratio" + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "UOPS_RETIRED.ALL / INST_RETIRED.ANY", + "MetricName": "tma_info_core_upi" }, { "BriefDescription": "Percent of instruction miss cost that hit in DRAM", "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / MEM_BOUND_STALLS.IFETCH", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_dramhit_percent" + "MetricName": "tma_info_frontend_inst_miss_cost_dramhit_percent" }, { "BriefDescription": "Percent of instruction miss cost that hit in the L2", "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / MEM_BOUND_STALLS.IFETCH", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_l2hit_percent" + "MetricName": "tma_info_frontend_inst_miss_cost_l2hit_percent" }, { "BriefDescription": "Percent of instruction miss cost that hit in the L3", "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / MEM_BOUND_STALLS.IFETCH", - "MetricGroup": " ", - "MetricName": "tma_info_inst_miss_cost_l3hit_percent" + "MetricName": "tma_info_frontend_inst_miss_cost_l3hit_percent" }, { - "BriefDescription": "Instructions per Branch (lower number means higher occurance rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_ipbranch" + "BriefDescription": "Ratio of all branches which mispredict", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES", + "MetricName": "tma_info_inst_mix_branch_mispredict_ratio" }, { - "BriefDescription": "Instructions Per Cycle", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": " ", - "MetricName": "tma_info_ipc" + "BriefDescription": "Ratio between Mispredicted branches and unknown branches", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BACLEARS.ANY", + "MetricName": "tma_info_inst_mix_branch_mispredict_to_unknown_branch_ratio" + }, + { + "BriefDescription": "Percentage of all uops which are FPDiv uops", + "MetricExpr": "100 * UOPS_RETIRED.FPDIV / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_fpdiv_uop_ratio" + }, + { + "BriefDescription": "Percentage of all uops which are IDiv uops", + "MetricExpr": "100 * UOPS_RETIRED.IDIV / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_idiv_uop_ratio" + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurance rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", + "MetricName": "tma_info_inst_mix_ipbranch" }, { "BriefDescription": "Instruction per (near) call (lower number means higher occurance rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.CALL", - "MetricGroup": " ", - "MetricName": "tma_info_ipcall" + "MetricName": "tma_info_inst_mix_ipcall" }, { "BriefDescription": "Instructions per Far Branch", "MetricExpr": "INST_RETIRED.ANY / (BR_INST_RETIRED.FAR_BRANCH / 2)", - "MetricGroup": " ", - "MetricName": "tma_info_ipfarbranch" + "MetricName": "tma_info_inst_mix_ipfarbranch" }, { "BriefDescription": "Instructions per Load", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricGroup": " ", - "MetricName": "tma_info_ipload" + "MetricName": "tma_info_inst_mix_ipload" }, { "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken", "MetricExpr": "INST_RETIRED.ANY / (BR_MISP_RETIRED.COND - BR_MISP_RETIRED.COND_TAKEN)", - "MetricName": "tma_info_ipmisp_cond_ntaken" + "MetricName": "tma_info_inst_mix_ipmisp_cond_ntaken" }, { "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricName": "tma_info_ipmisp_cond_taken" + "MetricName": "tma_info_inst_mix_ipmisp_cond_taken" }, { "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricName": "tma_info_ipmisp_indirect" + "MetricName": "tma_info_inst_mix_ipmisp_indirect" }, { "BriefDescription": "Instructions per retired return Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RETURN", - "MetricName": "tma_info_ipmisp_ret" + "MetricName": "tma_info_inst_mix_ipmisp_ret" }, { "BriefDescription": "Instructions per retired Branch Misprediction", "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": " ", - "MetricName": "tma_info_ipmispredict" + "MetricName": "tma_info_inst_mix_ipmispredict" }, { "BriefDescription": "Instructions per Store", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", - "MetricGroup": " ", - "MetricName": "tma_info_ipstore" + "MetricName": "tma_info_inst_mix_ipstore" }, { - "BriefDescription": "Fraction of cycles spent in Kernel mode", - "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE", - "MetricGroup": " ", - "MetricName": "tma_info_kernel_utilization" + "BriefDescription": "Percentage of all uops which are ucode ops", + "MetricExpr": "100 * UOPS_RETIRED.MS / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_microcode_uop_ratio" + }, + { + "BriefDescription": "Percentage of all uops which are x87 uops", + "MetricExpr": "100 * UOPS_RETIRED.X87 / UOPS_RETIRED.ALL", + "MetricName": "tma_info_inst_mix_x87_uop_ratio" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block", + "MetricExpr": "100 * LD_BLOCKS.4K_ALIAS / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricName": "tma_info_l1_bound_address_alias_blocks" }, { "BriefDescription": "Percentage of total non-speculative loads that are splits", "MetricExpr": "100 * MEM_UOPS_RETIRED.SPLIT_LOADS / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_load_splits" + "MetricName": "tma_info_l1_bound_load_splits" }, { - "BriefDescription": "load ops retired per 1000 instruction", - "MetricExpr": "1e3 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_memloadpki" + "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", + "MetricExpr": "100 * LD_BLOCKS.DATA_UNKNOWN / MEM_UOPS_RETIRED.ALL_LOADS", + "MetricName": "tma_info_l1_bound_store_fwd_blocks" }, { - "BriefDescription": "Percentage of all uops which are ucode ops", - "MetricExpr": "100 * UOPS_RETIRED.MS / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_microcode_uop_ratio" + "BriefDescription": "Cycle cost per DRAM hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_LOAD_UOPS_RETIRED.DRAM_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_dram_hit" }, { - "BriefDescription": "", - "MetricExpr": "5 * tma_info_clks", - "MetricGroup": " ", - "MetricName": "tma_info_slots" + "BriefDescription": "Cycle cost per L2 hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_l2_hit" }, { - "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", - "MetricExpr": "100 * LD_BLOCKS.DATA_UNKNOWN / MEM_UOPS_RETIRED.ALL_LOADS", - "MetricName": "tma_info_store_fwd_blocks" + "BriefDescription": "Cycle cost per LLC hit", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_LOAD_UOPS_RETIRED.L3_HIT", + "MetricName": "tma_info_memory_cycles_per_demand_load_l3_hit" }, { - "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", - "MetricGroup": " ", - "MetricName": "tma_info_turbo_utilization" + "BriefDescription": "load ops retired per 1000 instruction", + "MetricExpr": "1e3 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "tma_info_memory_memloadpki" }, { - "BriefDescription": "Uops Per Instruction", - "MetricExpr": "UOPS_RETIRED.ALL / INST_RETIRED.ANY", - "MetricGroup": " ", - "MetricName": "tma_info_upi" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Percentage of all uops which are x87 uops", - "MetricExpr": "100 * UOPS_RETIRED.X87 / UOPS_RETIRED.ALL", - "MetricGroup": " ", - "MetricName": "tma_info_x87_uop_ratio" + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_kernel_utilization" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_core_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization" }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.", - "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05", @@ -471,7 +445,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.", - "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1", @@ -480,7 +454,7 @@ { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / tma_info_core_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_core_clks, 0) * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.1", @@ -488,7 +462,7 @@ }, { "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.", - "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_clks, 0) * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD", + "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_core_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_core_clks, 0) * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD", "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.1", @@ -504,7 +478,7 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricName": "tma_machine_clears", "MetricThreshold": "tma_machine_clears > 0.05", @@ -513,7 +487,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.", - "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_mem_scheduler", "MetricThreshold": "tma_mem_scheduler > 0.1", @@ -521,7 +495,7 @@ }, { "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads.", - "MetricExpr": "min(tma_backend_bound, LD_HEAD.ANY_AT_RET / tma_info_clks + tma_store_bound)", + "MetricExpr": "min(tma_backend_bound, LD_HEAD.ANY_AT_RET / tma_info_core_clks + tma_store_bound)", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2", @@ -538,7 +512,7 @@ }, { "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)", - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_core_slots", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_ms_uops", "MetricThreshold": "tma_ms_uops > 0.05", @@ -548,7 +522,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.", - "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_non_mem_scheduler", "MetricThreshold": "tma_non_mem_scheduler > 0.1", @@ -556,7 +530,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear (slow nuke).", - "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / tma_info_slots", + "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_nuke", "MetricThreshold": "tma_nuke > 0.05", @@ -564,7 +538,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.", - "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_other_fb", "MetricThreshold": "tma_other_fb > 0.05", @@ -572,7 +546,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.", - "MetricExpr": "LD_HEAD.OTHER_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.OTHER_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_other_l1", "MetricThreshold": "tma_other_l1 > 0.05", @@ -588,7 +562,7 @@ }, { "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.", - "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / tma_info_slots", + "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group", "MetricName": "tma_other_ret", "MetricThreshold": "tma_other_ret > 0.3", @@ -604,7 +578,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.", - "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / tma_info_slots", + "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_predecode", "MetricThreshold": "tma_predecode > 0.05", @@ -612,7 +586,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).", - "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_register", "MetricThreshold": "tma_register > 0.1", @@ -620,7 +594,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls).", - "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_reorder_buffer", "MetricThreshold": "tma_reorder_buffer > 0.1", @@ -638,7 +612,7 @@ }, { "BriefDescription": "Counts the numer of issue slots that result in retirement slots.", - "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_slots", + "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_core_slots", "MetricGroup": "TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.75", @@ -655,7 +629,7 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).", - "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / tma_info_slots", + "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / tma_info_core_slots", "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", "MetricName": "tma_serialization", "MetricThreshold": "tma_serialization > 0.1", @@ -679,7 +653,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.", - "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_stlb_hit", "MetricThreshold": "tma_stlb_hit > 0.05", @@ -687,7 +661,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a second level TLB miss requiring a page walk.", - "MetricExpr": "LD_HEAD.PGWALK_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.PGWALK_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_stlb_miss", "MetricThreshold": "tma_stlb_miss > 0.05", @@ -703,7 +677,7 @@ }, { "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.", - "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_clks", + "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.05", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 66c37a3cbf43..c8d564f6091d 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -1,6 +1,6 @@ Family-model,Version,Filename,EventType -GenuineIntel-6-(97|9A|B7|BA|BF),v1.20,alderlake,core -GenuineIntel-6-BE,v1.20,alderlaken,core +GenuineIntel-6-(97|9A|B7|BA|BF),v1.21,alderlake,core +GenuineIntel-6-BE,v1.21,alderlaken,core GenuineIntel-6-(1C|26|27|35|36),v4,bonnell,core GenuineIntel-6-(3D|47),v27,broadwell,core GenuineIntel-6-56,v9,broadwellde,core -- cgit v1.2.3 From 7d124303d620cc29baba318bb313edf794c9ef60 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:51 -0700 Subject: perf vendor events intel: Update broadwell variant events/metrics Update broadwell events to v28, broadwellde to v10, broadwellx to v21. Including the new events FP_ARITH_INST_RETIRED.VECTOR, and FP_ARITH_INST_RETIRED.4_FLOPS. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/broadwell/bdw-metrics.json | 580 +++++++-------- .../arch/x86/broadwell/floating-point.json | 15 + .../arch/x86/broadwellde/bdwde-metrics.json | 556 +++++++------- .../arch/x86/broadwellde/floating-point.json | 15 + .../arch/x86/broadwellx/bdx-metrics.json | 796 +++++++++++++-------- .../arch/x86/broadwellx/floating-point.json | 15 + tools/perf/pmu-events/arch/x86/mapfile.csv | 6 +- 7 files changed, 1118 insertions(+), 865 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index f9e2316601e1..55a10b0bf36f 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -71,7 +71,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +81,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +89,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +109,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +125,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -159,7 +159,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +180,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -189,7 +189,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -199,7 +199,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -208,25 +208,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +235,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -244,7 +244,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -254,11 +254,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -266,14 +266,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -328,7 +328,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -348,435 +348,435 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" - }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." - }, - { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / tma_info_core_clks", + "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel requests to external memory", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_parallel_requests", + "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + }, + { + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_request_latency" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -785,7 +785,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -794,7 +794,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -804,7 +804,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -814,7 +814,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -823,11 +823,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -843,7 +843,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -853,7 +853,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -873,16 +873,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -892,7 +892,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -902,7 +902,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -915,21 +915,21 @@ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -938,7 +938,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -947,7 +947,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -956,7 +956,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -965,7 +965,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -983,7 +983,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -992,7 +992,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1001,7 +1001,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -1011,7 +1011,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1020,7 +1020,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1029,7 +1029,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1038,7 +1038,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1047,7 +1047,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1055,7 +1055,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1066,7 +1066,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1075,7 +1075,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1084,16 +1084,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1102,7 +1102,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1112,7 +1112,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1121,7 +1121,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1138,7 +1138,7 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS", + "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", diff --git a/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json b/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json index e4826dc7f797..986869252e71 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "2000003", + "UMask": "0x18" + }, { "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.", "EventCode": "0xc7", @@ -76,6 +84,13 @@ "SampleAfterValue": "2000005", "UMask": "0x2a" }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", + "SampleAfterValue": "2000003", + "UMask": "0xfc" + }, { "BriefDescription": "Cycles with any input/output SSE or FP assist", "CounterMask": "1", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json index e9c46d336a8e..8fc62b8f667d 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -65,7 +65,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -75,7 +75,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -83,7 +83,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -103,7 +103,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -119,12 +119,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -153,7 +153,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -174,7 +174,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -183,7 +183,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -193,7 +193,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -202,25 +202,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -229,7 +229,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -239,11 +239,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -251,14 +251,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -313,7 +313,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -333,417 +333,417 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" - }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." - }, - { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / tma_info_core_clks", + "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -752,7 +752,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -761,7 +761,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -771,7 +771,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -781,7 +781,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -790,11 +790,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -810,7 +810,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -820,7 +820,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -840,16 +840,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -859,7 +859,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -869,7 +869,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -882,21 +882,21 @@ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -905,7 +905,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -914,7 +914,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -923,7 +923,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -931,7 +931,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -948,7 +948,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -957,7 +957,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -966,7 +966,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -975,7 +975,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -984,7 +984,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -993,7 +993,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1002,7 +1002,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1011,7 +1011,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1020,7 +1020,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1031,7 +1031,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1040,7 +1040,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1049,16 +1049,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1067,7 +1067,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1077,7 +1077,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1086,7 +1086,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1104,7 +1104,7 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS", + "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json b/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json index e4826dc7f797..986869252e71 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "2000003", + "UMask": "0x18" + }, { "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.", "EventCode": "0xc7", @@ -76,6 +84,13 @@ "SampleAfterValue": "2000005", "UMask": "0x2a" }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", + "SampleAfterValue": "2000003", + "UMask": "0xfc" + }, { "BriefDescription": "Cycles with any input/output SSE or FP assist", "CounterMask": "1", diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index 437b9867acb9..b319e4edc238 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -50,10 +50,206 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ + cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x180\\,filter_tid\\=0x3e@) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x181@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x191@) / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x192@) / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / (cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / (cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_loop_stream_detector", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_Q_TxL_FLITS_G0.DATA * 8 / 1e6 / duration_time", + "MetricName": "qpi_data_transmit_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -69,9 +265,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +283,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +291,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +311,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +327,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -159,7 +361,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +382,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -189,7 +391,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -199,7 +401,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -208,25 +410,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +437,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -244,7 +446,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_clks", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -254,11 +456,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -266,14 +468,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -328,7 +530,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -348,436 +550,436 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" - }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." - }, - { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_socket_clks / duration_time)", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / (2 * tma_info_core_clks)", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / (2 * tma_info_core_core_clks)", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_system_socket_clks / duration_time)", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cbox_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_clks", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -786,7 +988,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -795,7 +997,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -805,7 +1007,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -815,7 +1017,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -824,11 +1026,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -844,7 +1046,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -854,7 +1056,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -864,7 +1066,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -884,16 +1086,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -903,7 +1105,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -913,7 +1115,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -926,21 +1128,21 @@ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -949,7 +1151,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -958,7 +1160,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -967,7 +1169,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -976,7 +1178,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -994,7 +1196,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1003,7 +1205,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1012,7 +1214,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -1022,7 +1224,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1031,7 +1233,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1040,7 +1242,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1049,7 +1251,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1058,7 +1260,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1067,7 +1269,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1077,7 +1279,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1086,7 +1288,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1097,7 +1299,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1106,7 +1308,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1115,16 +1317,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1133,7 +1335,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1143,7 +1345,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1152,7 +1354,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1169,11 +1371,17 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS", + "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" } ] diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json b/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json index e4826dc7f797..986869252e71 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "2000003", + "UMask": "0x18" + }, { "BriefDescription": "Number of SSE/AVX computational double precision floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.", "EventCode": "0xc7", @@ -76,6 +84,13 @@ "SampleAfterValue": "2000005", "UMask": "0x2a" }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "EventCode": "0xc7", + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", + "SampleAfterValue": "2000003", + "UMask": "0xfc" + }, { "BriefDescription": "Cycles with any input/output SSE or FP assist", "CounterMask": "1", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index c8d564f6091d..4a7281be24ac 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -2,9 +2,9 @@ Family-model,Version,Filename,EventType GenuineIntel-6-(97|9A|B7|BA|BF),v1.21,alderlake,core GenuineIntel-6-BE,v1.21,alderlaken,core GenuineIntel-6-(1C|26|27|35|36),v4,bonnell,core -GenuineIntel-6-(3D|47),v27,broadwell,core -GenuineIntel-6-56,v9,broadwellde,core -GenuineIntel-6-4F,v20,broadwellx,core +GenuineIntel-6-(3D|47),v28,broadwell,core +GenuineIntel-6-56,v10,broadwellde,core +GenuineIntel-6-4F,v21,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.17,cascadelakex,core GenuineIntel-6-9[6C],v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core -- cgit v1.2.3 From 8c61edb840df87f2a5e370c7dfe954d07cf4db2d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:52 -0700 Subject: perf vendor events intel: Update cascadelakex events/metrics Update cascadelakex to v1.18 including the new events INT_MISC.CLEARS_COUNT, FP_ARITH_INST_RETIRED.VECTOR, FP_ARITH_INST_RETIRED.SCALAR, FP_ARITH_INST_RETIRED.8_FLOPS and FP_ARITH_INST_RETIRED.4_FLOPS. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/cascadelakex/clx-metrics.json | 1231 ++++++++++++-------- .../arch/x86/cascadelakex/floating-point.json | 31 + .../pmu-events/arch/x86/cascadelakex/pipeline.json | 23 +- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 4 files changed, 794 insertions(+), 493 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 875c766222e3..0e2e446ced7a 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -50,10 +50,237 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "dtlb_2mb_large_page_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3) * 4 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12CC0233@ / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40433@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40433@) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40432@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40431@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12D40433@ / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_RPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_PMM_RPQ_INSERTS + UNC_M_PMM_WPQ_INSERTS) * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_WPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -69,9 +296,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -80,7 +313,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -88,7 +321,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_slots", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -97,7 +330,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -107,7 +340,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -123,12 +356,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -146,7 +379,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -156,7 +389,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -177,7 +410,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -186,16 +419,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -205,7 +438,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound - tma_pmm_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound)", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound - tma_pmm_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -214,45 +447,45 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(110 * tma_info_average_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_average_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_clks", + "MetricExpr": "(110 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -262,11 +495,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -274,14 +507,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -356,7 +589,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -375,7 +608,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_slots", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -385,7 +618,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_clks", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -393,705 +626,711 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_core_ipmispredict", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", + "MetricName": "tma_info_core_ipmispredict", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_read_bw" - }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_write_bw" - }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" - }, - { - "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", - "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", - "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" - }, - { - "BriefDescription": "Instruction per taken branch", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", + "MetricGroup": "Prefetches", + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" }, { - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_nonsilent_pki" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_silent_pki" + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" + }, + { + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_read_bw" + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_write_bw" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_dram_read_latency", + "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average number of parallel data read requests to external memory", "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", + "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_M_PMM_RPQ_OCCUPANCY.ALL / UNC_M_PMM_RPQ_INSERTS) / imc_0@event\\=0x0@ if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_pmm_read_latency", + "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", + "MetricName": "tma_info_system_mem_read_latency", "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" - }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" - }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" - }, - { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" - }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_read_bw" + "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_write_bw" + "MetricName": "tma_info_system_pmm_write_bw" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license0_utilization", + "MetricName": "tma_info_system_power_license0_utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license1_utilization", - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", + "MetricName": "tma_info_system_power_license1_utilization", + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license2_utilization", - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", + "MetricName": "tma_info_system_power_license2_utilization", + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cha_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1100,7 +1339,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1110,7 +1349,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1119,7 +1358,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1128,20 +1367,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1156,7 +1395,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1174,7 +1413,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1182,7 +1421,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "59.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1191,7 +1430,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", - "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1211,20 +1450,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1248,7 +1487,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1257,19 +1496,19 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1284,7 +1523,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1321,7 +1560,7 @@ { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1330,7 +1569,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1339,7 +1578,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1348,7 +1587,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -1357,7 +1596,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -1375,7 +1614,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1384,7 +1623,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1393,7 +1632,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -1402,7 +1641,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1411,7 +1650,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1420,7 +1659,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1429,7 +1668,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1438,7 +1677,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1447,7 +1686,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1456,7 +1695,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "127 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1465,7 +1704,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1475,7 +1714,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1484,7 +1723,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / tma_info_clks", + "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1494,7 +1733,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1503,7 +1742,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1512,16 +1751,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1530,7 +1769,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1540,7 +1779,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1549,7 +1788,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1565,7 +1804,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1573,7 +1812,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "9 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -1616,5 +1855,17 @@ "MetricGroup": "transaction", "MetricName": "tsx_transactional_cycles", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_transmit_bw", + "ScaleUnit": "1MB/s" } ] diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/floating-point.json b/tools/perf/pmu-events/arch/x86/cascadelakex/floating-point.json index 1f46e6b33856..bb4d5101f962 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x18" + }, { "BriefDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xC7", @@ -47,6 +55,22 @@ "SampleAfterValue": "2000003", "UMask": "0x80" }, + { + "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x18" + }, + { + "BriefDescription": "Counts once for most SIMD scalar computational floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.SCALAR", + "PublicDescription": "Counts once for most SIMD scalar computational single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SIMD scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "2000003", + "UMask": "0x3" + }, { "BriefDescription": "Counts once for most SIMD scalar computational double precision floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", "EventCode": "0xC7", @@ -63,6 +87,13 @@ "SampleAfterValue": "2000003", "UMask": "0x2" }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", + "SampleAfterValue": "2000003", + "UMask": "0xfc" + }, { "BriefDescription": "Intel AVX-512 computational 512-bit packed BFloat16 instructions retired.", "EventCode": "0xCF", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json index 0f06e314fe36..31a1663d57f8 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/pipeline.json @@ -26,12 +26,21 @@ "UMask": "0x4" }, { - "BriefDescription": "Conditional branch instructions retired.", + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "Errata": "SKL091", + "EventCode": "0xC4", + "EventName": "BR_INST_RETIRED.COND", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "SampleAfterValue": "400009", + "UMask": "0x1" + }, + { + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "Errata": "SKL091", "EventCode": "0xC4", "EventName": "BR_INST_RETIRED.CONDITIONAL", "PEBS": "1", - "PublicDescription": "This event counts conditional branch instructions retired.", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "SampleAfterValue": "400009", "UMask": "0x1" }, @@ -413,6 +422,16 @@ "SampleAfterValue": "2000003", "UMask": "0x1" }, + { + "BriefDescription": "Clears speculative count", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0x0D", + "EventName": "INT_MISC.CLEARS_COUNT", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, { "BriefDescription": "Cycles the issue-stage is waiting for front-end to fetch from resteered path following branch misprediction or machine clear events.", "EventCode": "0x0D", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 4a7281be24ac..6b132eecd2a7 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -5,7 +5,7 @@ GenuineIntel-6-(1C|26|27|35|36),v4,bonnell,core GenuineIntel-6-(3D|47),v28,broadwell,core GenuineIntel-6-56,v10,broadwellde,core GenuineIntel-6-4F,v21,broadwellx,core -GenuineIntel-6-55-[56789ABCDEF],v1.17,cascadelakex,core +GenuineIntel-6-55-[56789ABCDEF],v1.18,cascadelakex,core GenuineIntel-6-9[6C],v1.03,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -- cgit v1.2.3 From 27aebf378b0d8ecb69fa7db88ef016cfb8e6e37a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:53 -0700 Subject: perf vendor events intel: Update elkhartlake events Update elkhartlake to v1.04 that marks deprecated a number of events and adds additional description to MEM_BOUND_STALLS.IFETCH. The events data was generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/elkhartlake/cache.json | 7 +++++++ tools/perf/pmu-events/arch/x86/elkhartlake/memory.json | 2 ++ tools/perf/pmu-events/arch/x86/elkhartlake/other.json | 10 ++++++++++ tools/perf/pmu-events/arch/x86/elkhartlake/pipeline.json | 3 +++ tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/x86/elkhartlake/cache.json b/tools/perf/pmu-events/arch/x86/elkhartlake/cache.json index 0ab90e3bf76b..c6be60584522 100644 --- a/tools/perf/pmu-events/arch/x86/elkhartlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/elkhartlake/cache.json @@ -72,6 +72,7 @@ "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).", "EventCode": "0x34", "EventName": "MEM_BOUND_STALLS.IFETCH", + "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).", "SampleAfterValue": "200003", "UMask": "0x38" }, @@ -437,6 +438,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT", "MSRIndex": "0x1a6,0x1a7", @@ -446,6 +448,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HITM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -455,6 +458,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HIT_NO_FWD", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -464,6 +468,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HIT_WITH_FWD", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -473,6 +478,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_MISS", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -482,6 +488,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_NOT_NEEDED", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_NOT_NEEDED", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/elkhartlake/memory.json b/tools/perf/pmu-events/arch/x86/elkhartlake/memory.json index 18621909d1a9..c02eb0e836ad 100644 --- a/tools/perf/pmu-events/arch/x86/elkhartlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/elkhartlake/memory.json @@ -96,6 +96,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_MISS", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -105,6 +106,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_MISS_LOCAL", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/elkhartlake/other.json b/tools/perf/pmu-events/arch/x86/elkhartlake/other.json index 00ae180ded25..fefbc383b840 100644 --- a/tools/perf/pmu-events/arch/x86/elkhartlake/other.json +++ b/tools/perf/pmu-events/arch/x86/elkhartlake/other.json @@ -1,6 +1,7 @@ [ { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.SELF_LOCKS", + "Deprecated": "1", "EdgeDetect": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.ALL", @@ -16,6 +17,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.BLOCK_CYCLES", + "Deprecated": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.CYCLES_OTHER_BLOCK", "SampleAfterValue": "200003", @@ -23,6 +25,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.LOCK_CYCLES", + "Deprecated": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.CYCLES_SELF_BLOCK", "SampleAfterValue": "200003", @@ -46,6 +49,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_DRAM_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_DRAM_HIT", "SampleAfterValue": "200003", @@ -53,6 +57,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_L2_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_L2_HIT", "SampleAfterValue": "200003", @@ -60,6 +65,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_LLC_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_LLC_HIT", "SampleAfterValue": "200003", @@ -207,6 +213,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.ANY_RESPONSE", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -216,6 +223,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.DRAM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -225,6 +233,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.LOCAL_DRAM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.LOCAL_DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -234,6 +243,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.OUTSTANDING", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.OUTSTANDING", "MSRIndex": "0x1a6", diff --git a/tools/perf/pmu-events/arch/x86/elkhartlake/pipeline.json b/tools/perf/pmu-events/arch/x86/elkhartlake/pipeline.json index 9dd8c909facc..c483c0838e08 100644 --- a/tools/perf/pmu-events/arch/x86/elkhartlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/elkhartlake/pipeline.json @@ -165,6 +165,7 @@ }, { "BriefDescription": "This event is deprecated.", + "Deprecated": "1", "EventCode": "0xcd", "EventName": "CYCLES_DIV_BUSY.ANY", "SampleAfterValue": "2000003" @@ -283,6 +284,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event TOPDOWN_BAD_SPECULATION.FASTNUKE", + "Deprecated": "1", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.MONUKE", "SampleAfterValue": "1000003", @@ -338,6 +340,7 @@ }, { "BriefDescription": "This event is deprecated.", + "Deprecated": "1", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.STORE_BUFFER", "SampleAfterValue": "1000003", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 6b132eecd2a7..f3ae41e28ed2 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -6,7 +6,7 @@ GenuineIntel-6-(3D|47),v28,broadwell,core GenuineIntel-6-56,v10,broadwellde,core GenuineIntel-6-4F,v21,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.18,cascadelakex,core -GenuineIntel-6-9[6C],v1.03,elkhartlake,core +GenuineIntel-6-9[6C],v1.04,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-B6,v1.00,grandridge,core -- cgit v1.2.3 From c9e7771f28d083dab1afccdd72c3712e31aea4d5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:54 -0700 Subject: perf vendor events intel: Update haswell(x) metrics Metrics are updated to make TMA info metric names synchronized. Metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/haswell/hsw-metrics.json | 484 +++++++------- .../pmu-events/arch/x86/haswellx/hsx-metrics.json | 700 +++++++++++++-------- 2 files changed, 696 insertions(+), 488 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json index 9570a88d6d1c..79d89c263677 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -71,7 +71,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +81,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +89,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +109,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +125,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -150,7 +150,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -171,7 +171,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +180,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_clks", + "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -190,7 +190,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -199,25 +199,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -226,7 +226,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +235,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE / tma_info_clks", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -245,11 +245,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -257,14 +257,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -274,7 +274,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -294,324 +294,324 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" - }, - { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel requests to external memory", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_parallel_requests", + "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + }, + { + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_request_latency" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -620,7 +620,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -629,7 +629,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -639,7 +639,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -649,7 +649,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -658,11 +658,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -678,7 +678,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -688,7 +688,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -708,16 +708,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -727,7 +727,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -737,7 +737,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -746,16 +746,16 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -764,7 +764,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -773,7 +773,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -782,7 +782,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -791,7 +791,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -809,7 +809,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -818,7 +818,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -827,7 +827,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -837,7 +837,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -846,7 +846,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -855,7 +855,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -864,7 +864,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -873,7 +873,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -881,7 +881,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -892,7 +892,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -901,7 +901,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -910,16 +910,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -928,7 +928,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -938,7 +938,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -947,7 +947,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -955,7 +955,7 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS", + "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index a522202cf684..5f451948c893 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -50,10 +50,206 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x181@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x191@) / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x192@) / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / (cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / (cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "(UOPS_ISSUED.ANY - IDQ.MITE_UOPS - IDQ.MS_UOPS - IDQ.DSB_UOPS) / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_loop_stream_detector", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_Q_TxL_FLITS_G0.DATA * 8 / 1e6 / duration_time", + "MetricName": "qpi_data_transmit_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -69,9 +265,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_UOPS_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +283,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +291,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +311,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +327,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -150,7 +352,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -171,7 +373,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +382,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_clks", + "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -190,7 +392,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -199,25 +401,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -226,7 +428,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +437,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_clks", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -245,11 +447,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -257,14 +459,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -274,7 +476,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -294,325 +496,325 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" - }, - { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_socket_clks / duration_time)", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_system_socket_clks / duration_time)", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cbox_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -621,7 +823,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -630,7 +832,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -640,7 +842,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -650,7 +852,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -659,11 +861,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -679,7 +881,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -689,7 +891,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -699,7 +901,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -719,16 +921,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -738,7 +940,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -748,7 +950,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -757,16 +959,16 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -775,7 +977,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -784,7 +986,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -793,7 +995,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -802,7 +1004,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -820,7 +1022,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -829,7 +1031,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -838,7 +1040,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -848,7 +1050,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -857,7 +1059,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -866,7 +1068,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -875,7 +1077,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -884,7 +1086,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -893,7 +1095,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -903,7 +1105,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -912,7 +1114,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -923,7 +1125,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -932,7 +1134,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -941,16 +1143,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -959,7 +1161,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -969,7 +1171,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -978,7 +1180,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -986,11 +1188,17 @@ }, { "BriefDescription": "This metric serves as an approximation of legacy x87 usage", - "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS", + "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_x87_use", "MetricThreshold": "tma_x87_use > 0.1", "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" } ] -- cgit v1.2.3 From 545dbda74dbcca9df8a90afcfd28f5224e548b35 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:55 -0700 Subject: perf vendor events intel: Update icelake/icelakex events/metrics Update icelake events to v1.18 including the new events MEM_LOAD_MISC_RETIRED.UC and SQ_MISC.BUS_LOCK. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/icelake/cache.json | 18 + .../pmu-events/arch/x86/icelake/icl-metrics.json | 950 +++++++------- .../pmu-events/arch/x86/icelakex/icx-metrics.json | 1306 ++++++++++++-------- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 4 files changed, 1276 insertions(+), 1000 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/icelake/cache.json b/tools/perf/pmu-events/arch/x86/icelake/cache.json index a9174a0837f0..79b9f02a4b63 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/cache.json +++ b/tools/perf/pmu-events/arch/x86/icelake/cache.json @@ -338,6 +338,16 @@ "SampleAfterValue": "100003", "UMask": "0x8" }, + { + "BriefDescription": "Retired instructions with at least 1 uncacheable load or Bus Lock.", + "Data_LA": "1", + "EventCode": "0xd4", + "EventName": "MEM_LOAD_MISC_RETIRED.UC", + "PEBS": "1", + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock).", + "SampleAfterValue": "100007", + "UMask": "0x4" + }, { "BriefDescription": "Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.", "Data_LA": "1", @@ -833,6 +843,14 @@ "SampleAfterValue": "1000003", "UMask": "0x4" }, + { + "BriefDescription": "Counts bus locks, accounts for cache line split locks and UC locks.", + "EventCode": "0xF4", + "EventName": "SQ_MISC.BUS_LOCK", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", + "SampleAfterValue": "100003", + "UMask": "0x10" + }, { "BriefDescription": "Cycles the queue waiting for offcore responses is full.", "EventCode": "0xf4", diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index ae8a96ec7fa5..20210742171d 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -64,7 +64,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -85,7 +85,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -94,7 +94,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -102,7 +102,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_slots", + "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -111,7 +111,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -131,7 +131,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", - "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", @@ -144,12 +144,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -167,7 +167,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -177,7 +177,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(29 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -197,7 +197,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "23.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -206,16 +206,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -225,7 +225,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -234,43 +234,43 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "32.5 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -279,11 +279,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -291,14 +291,14 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_slots", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -327,7 +327,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -336,7 +336,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -345,7 +345,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -354,7 +354,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -363,7 +363,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_512b", "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -372,7 +372,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -392,7 +392,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -400,676 +400,676 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 5 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 5 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", + "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", + "MetricGroup": "Fed;LSD", + "MetricName": "tma_info_frontend_lsd_coverage" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 11", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 11", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_instructions", + "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency" + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" }, { - "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", - "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", - "MetricGroup": "Fed;LSD", - "MetricName": "tma_info_lsd_coverage" + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches" + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license0_utilization", + "MetricName": "tma_info_system_power_license0_utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license1_utilization", - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", + "MetricName": "tma_info_system_power_license1_utilization", + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license2_utilization", - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", + "MetricName": "tma_info_system_power_license2_utilization", + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, - { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization" - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "TOPDOWN.SLOTS", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization" }, { "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 7.5" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 7.5" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1078,7 +1078,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1088,7 +1088,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1097,7 +1097,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1106,20 +1106,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "9 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1134,7 +1134,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1151,7 +1151,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1160,7 +1160,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1169,10 +1169,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%" }, @@ -1188,20 +1188,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1225,7 +1225,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1234,28 +1234,28 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", - "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_clks", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "ScaleUnit": "100%" }, { @@ -1269,7 +1269,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1278,7 +1278,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -1297,7 +1297,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1306,7 +1306,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1315,7 +1315,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1324,7 +1324,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1333,7 +1333,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1342,7 +1342,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1351,7 +1351,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1360,7 +1360,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1369,7 +1369,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1378,7 +1378,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1388,7 +1388,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1397,7 +1397,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_clks", + "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1406,7 +1406,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1415,7 +1415,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1424,16 +1424,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1442,7 +1442,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1451,7 +1451,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1460,7 +1460,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1477,7 +1477,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1485,7 +1485,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1494,7 +1494,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "10 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index b736fec164d0..ef25cda019be 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -29,10 +29,243 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_CRD + UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF) / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_to_pmem_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory write bandwidth (MB/sec) caused by directory updates; includes DDR and Intel(R) Optane(TM) Persistent Memory(PMEM).", + "MetricExpr": "(UNC_CHA_DIR_UPDATE.HA + UNC_CHA_DIR_UPDATE.TOR + UNC_M2M_DIRECTORY_UPDATE.ANY) * 64 / 1e6 / duration_time", + "MetricName": "memory_extra_write_bw_due_to_directory_updates", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_RPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_PMM_RPQ_INSERTS + UNC_M_PMM_WPQ_INSERTS) * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_WPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -48,9 +281,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -59,7 +298,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -67,7 +306,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_slots", + "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -76,7 +315,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -96,7 +335,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", - "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", @@ -109,12 +348,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -132,7 +371,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -142,7 +381,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -162,7 +401,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43.5 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "43.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -171,16 +410,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -190,7 +429,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound - tma_pmm_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound)", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound - tma_pmm_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -199,43 +438,43 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "48 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "48 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -244,11 +483,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -256,14 +495,14 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_slots", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -292,7 +531,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -301,7 +540,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -310,7 +549,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -319,7 +558,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -328,7 +567,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_512b", "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -337,7 +576,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -357,7 +596,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -365,734 +604,741 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_core_ipmispredict", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", + "MetricName": "tma_info_core_ipmispredict", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 5 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 5 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_read_bw" - }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_write_bw" - }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" - }, - { - "BriefDescription": "Instruction per taken branch", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 11", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 11", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" }, { - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_nonsilent_pki" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_silent_pki" + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_instructions", + "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency" + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" + }, + { + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_read_bw" + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_write_bw" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / cha_0@event\\=0x0@", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_dram_read_latency", + "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average number of parallel data read requests to external memory", "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", + "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / cha_0@event\\=0x0@ if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_pmm_read_latency", + "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", + "MetricName": "tma_info_system_mem_read_latency", "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" - }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" - }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" - }, - { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches" - }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" - }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_read_bw" + "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_write_bw" + "MetricName": "tma_info_system_pmm_write_bw" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license0_utilization", + "MetricName": "tma_info_system_power_license0_utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license1_utilization", - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", + "MetricName": "tma_info_system_power_license1_utilization", + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license2_utilization", - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", + "MetricName": "tma_info_system_power_license2_utilization", + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, - { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization" - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cha_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "TOPDOWN.SLOTS", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization" }, { "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 7.5" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 7.5" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1101,7 +1347,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1111,7 +1357,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1120,7 +1366,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1129,20 +1375,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "19 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "19 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1157,7 +1403,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1174,7 +1420,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1182,7 +1428,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "43.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1192,7 +1438,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1211,20 +1457,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1248,7 +1494,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1257,28 +1503,28 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", - "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_clks", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "ScaleUnit": "100%" }, { @@ -1292,7 +1538,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1301,7 +1547,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -1320,7 +1566,7 @@ }, { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1329,7 +1575,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1338,7 +1584,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1347,7 +1593,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1356,7 +1602,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1365,7 +1611,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1374,7 +1620,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1383,7 +1629,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1392,7 +1638,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1401,7 +1647,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1410,7 +1656,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(97 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1419,7 +1665,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "108 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "108 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1428,7 +1674,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1438,7 +1684,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1447,7 +1693,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "37 * MISC_RETIRED.PAUSE_INST / tma_info_clks", + "MetricExpr": "37 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1456,7 +1702,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1465,7 +1711,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1474,16 +1720,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1492,7 +1738,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1501,7 +1747,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1510,7 +1756,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1527,7 +1773,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1535,7 +1781,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1544,7 +1790,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "10 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -1587,5 +1833,17 @@ "MetricGroup": "transaction", "MetricName": "tsx_transactional_cycles", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_transmit_bw", + "ScaleUnit": "1MB/s" } ] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index f3ae41e28ed2..1d2e63575da7 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -13,7 +13,7 @@ GenuineIntel-6-B6,v1.00,grandridge,core GenuineIntel-6-A[DE],v1.01,graniterapids,core GenuineIntel-6-(3C|45|46),v33,haswell,core GenuineIntel-6-3F,v27,haswellx,core -GenuineIntel-6-(7D|7E|A7),v1.17,icelake,core +GenuineIntel-6-(7D|7E|A7),v1.18,icelake,core GenuineIntel-6-6[AC],v1.20,icelakex,core GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v23,ivytown,core -- cgit v1.2.3 From b27d3ece5c9b70a87f68f0e52d68fb6eb828cbf8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:56 -0700 Subject: perf vendor events intel: Update ivybridge/ivytown metrics Metrics are updated to make TMA info metric names synchronized. Metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/ivybridge/ivb-metrics.json | 526 ++++++++++---------- .../pmu-events/arch/x86/ivytown/ivt-metrics.json | 534 ++++++++++----------- 2 files changed, 530 insertions(+), 530 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json index 11080ccffd51..33fe555252b2 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -71,7 +71,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +81,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +89,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +109,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +125,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -150,7 +150,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -171,7 +171,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +180,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -190,7 +190,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -199,25 +199,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -226,7 +226,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +235,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE / tma_info_clks", + "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -245,11 +245,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -257,14 +257,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -301,7 +301,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -321,358 +321,358 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_clks - tma_itlb_misses", + "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "1 / (tma_fp_scalar + tma_fp_vector)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel requests to external memory", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_parallel_requests", - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel requests to external memory", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_parallel_requests", + "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" + }, + { + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_request_latency" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -681,7 +681,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -690,7 +690,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -700,7 +700,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -710,7 +710,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_clks", + "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -719,11 +719,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -739,7 +739,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -749,7 +749,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -769,16 +769,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -788,7 +788,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -798,7 +798,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -807,16 +807,16 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -825,7 +825,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -834,7 +834,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -843,7 +843,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -852,7 +852,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -870,7 +870,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -880,7 +880,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -889,7 +889,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -898,7 +898,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -907,7 +907,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -916,7 +916,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -924,7 +924,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -935,7 +935,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "13 * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -944,7 +944,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -953,16 +953,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -971,7 +971,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -981,7 +981,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -990,7 +990,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json index 65a46d659c0a..f5e46a768fdd 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -71,7 +71,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -81,7 +81,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -89,7 +89,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots", + "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -109,7 +109,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -125,12 +125,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -150,7 +150,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(60 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -171,7 +171,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "43 * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -180,7 +180,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -190,7 +190,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -199,25 +199,25 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -226,7 +226,7 @@ }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -235,7 +235,7 @@ }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_clks", + "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -245,11 +245,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -257,14 +257,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -301,7 +301,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -321,359 +321,359 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.", - "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_clks - tma_itlb_misses", + "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "1 / (tma_fp_scalar + tma_fp_vector)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp" }, { "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "0", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_socket_clks / duration_time)", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "0", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks", + "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_system_socket_clks / duration_time)", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cbox_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -682,7 +682,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)", + "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -691,7 +691,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -701,7 +701,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -711,7 +711,7 @@ { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -720,11 +720,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -740,7 +740,7 @@ { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -750,7 +750,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -760,7 +760,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks", + "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -780,16 +780,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -799,7 +799,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -809,7 +809,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -818,16 +818,16 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -836,7 +836,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -845,7 +845,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -854,7 +854,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -863,7 +863,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -881,7 +881,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -891,7 +891,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - (UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if tma_info_thread_ipc > 1.8 else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -900,7 +900,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -909,7 +909,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -918,7 +918,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -927,7 +927,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_clks", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -936,7 +936,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks", + "MetricExpr": "(200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -946,7 +946,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks", + "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -955,7 +955,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -966,7 +966,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "13 * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -975,7 +975,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -984,16 +984,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1002,7 +1002,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1012,7 +1012,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1021,7 +1021,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", -- cgit v1.2.3 From e08d2ae9bfc225edaea795e74c58934bdb91d27c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:57 -0700 Subject: perf vendor events intel: Update jaketown metrics Metrics are updated to make TMA info metric names synchronized. Metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/jaketown/jkt-metrics.json | 224 ++++++++++----------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index 66a6f657bd6f..35b1a3aa728d 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -82,7 +82,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -98,12 +98,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -123,7 +123,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -133,7 +133,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -142,16 +142,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1", @@ -163,14 +163,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -207,7 +207,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -225,170 +225,170 @@ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_lcp" }, { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary;TmaL1;tma_L1_group", + "MetricName": "tma_info_inst_mix_instructions", + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Total number of retired Instructions", - "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", - "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" }, { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" }, { "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "MetricName": "tma_info_system_kernel_cpi" }, { "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average number of parallel data read requests to external memory", "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", + "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_socket_clks / duration_time)", + "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@filter_opc\\=0x182@) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", + "MetricName": "tma_info_system_mem_read_latency", "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cbox_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -398,7 +398,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -407,11 +407,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage", "ScaleUnit": "100%" }, { @@ -437,16 +437,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_dram_bw_use", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -456,7 +456,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -466,7 +466,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -475,7 +475,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -485,7 +485,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -494,7 +494,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -504,7 +504,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", -- cgit v1.2.3 From 98f17fb413037393f3046fa5d9687ced1e4eb9c3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:58 -0700 Subject: perf vendor events intel: Update sandybridge metrics Metrics are updated to make TMA info metric names synchronized. Metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/sandybridge/snb-metrics.json | 222 ++++++++++----------- 1 file changed, 111 insertions(+), 111 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index 4b8bc19392a4..8898b6fd0dea 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -82,7 +82,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -98,12 +98,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks", + "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -123,7 +123,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_clks", + "MetricExpr": "ARITH.FPU_DIV_ACTIVE / tma_info_core_core_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -133,7 +133,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -142,16 +142,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(7 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1", @@ -163,14 +163,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots", + "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -207,7 +207,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -225,169 +225,169 @@ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" - }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_lcp" }, { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary;TmaL1;tma_L1_group", + "MetricName": "tma_info_inst_mix_instructions", + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Total number of retired Instructions", - "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", - "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth" }, { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" }, { "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "MetricName": "tma_info_system_kernel_cpi" }, { "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average number of parallel requests to external memory", "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_parallel_requests", + "MetricName": "tma_info_system_mem_parallel_requests", "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" }, { "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" - }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "MetricName": "tma_info_system_mem_request_latency" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "UNC_CLOCK.SOCKET", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_DISPATCHED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks", + "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -397,7 +397,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", "MetricConstraint": "NO_GROUP_EVENTS_SMT", - "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks", + "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -406,11 +406,11 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage", "ScaleUnit": "100%" }, { @@ -436,16 +436,16 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_dram_bw_use", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -455,7 +455,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -465,7 +465,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -474,7 +474,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -484,7 +484,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / tma_info_clks", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_DISPATCH) + cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=1@ - (cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_DISPATCHED.THREAD\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_L1D_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -493,7 +493,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -503,7 +503,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", -- cgit v1.2.3 From 9a5511eadea316f184940f92dfc5da1eecb0bfd9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:59 -0700 Subject: perf vendor events intel: Update sapphirerapids events/metrics Update sapphirerapids events to v1.13 improving event descriptions. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../pmu-events/arch/x86/sapphirerapids/memory.json | 6 +- .../arch/x86/sapphirerapids/spr-metrics.json | 1357 ++++++++++++-------- .../x86/sapphirerapids/uncore-interconnect.json | 2 +- .../arch/x86/sapphirerapids/uncore-memory.json | 8 +- 5 files changed, 823 insertions(+), 552 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 1d2e63575da7..59afd27feb1d 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -23,7 +23,7 @@ GenuineIntel-6-A[AC],v1.01,meteorlake,core GenuineIntel-6-1[AEF],v3,nehalemep,core GenuineIntel-6-2E,v3,nehalemex,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-(8F|CF),v1.12,sapphirerapids,core +GenuineIntel-6-(8F|CF),v1.13,sapphirerapids,core GenuineIntel-6-AF,v1.00,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v55,skylake,core diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json index b72a36999930..e8bf7c9c44e1 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json @@ -32,18 +32,20 @@ "UMask": "0x3" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "BriefDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding.", "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "BriefDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding.", "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 4308e2483112..4f3dd85540b6 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -29,10 +29,261 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA + UNC_CHA_TOR_INSERTS.IA_MISS_DRD + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF) / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to DRAM in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_to_dram_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_to_pmem_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory write bandwidth (MB/sec) caused by directory updates; includes DDR and Intel(R) Optane(TM) Persistent Memory(PMEM).", + "MetricExpr": "(UNC_CHA_DIR_UPDATE.HA + UNC_CHA_DIR_UPDATE.TOR + UNC_M2M_DIRECTORY_UPDATE.ANY) * 64 / 1e6 / duration_time", + "MetricName": "memory_extra_write_bw_due_to_directory_updates", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_RPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_PMM_RPQ_INSERTS + UNC_M_PMM_WPQ_INSERTS) * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_WPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -48,9 +299,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -58,7 +315,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations", - "MetricExpr": "EXE.AMX_BUSY / tma_info_core_clks", + "MetricExpr": "EXE.AMX_BUSY / tma_info_core_core_clks", "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_L5_group;tma_ports_utilized_0_group", "MetricName": "tma_amx_busy", "MetricThreshold": "tma_amx_busy > 0.5 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -66,7 +323,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_slots", + "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -75,7 +332,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists.", - "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_slots", + "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_avx_assists", "MetricThreshold": "tma_avx_assists > 0.1", @@ -83,7 +340,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -103,17 +360,17 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -131,7 +388,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -141,7 +398,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(76 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(76 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -161,7 +418,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "75.5 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "75.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -170,16 +427,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -189,7 +446,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks)", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -198,43 +455,43 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "80 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "80 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -243,11 +500,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -255,14 +512,14 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -281,7 +538,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_fp_amx", "MetricThreshold": "tma_fp_amx > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -300,7 +557,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", - "MetricExpr": "30 * ASSISTS.FP / tma_info_slots", + "MetricExpr": "30 * ASSISTS.FP / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_fp_assists", "MetricThreshold": "tma_fp_assists > 0.1", @@ -309,7 +566,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -318,7 +575,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -327,7 +584,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -336,7 +593,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -345,7 +602,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_512b", "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -354,7 +611,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -364,7 +621,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", @@ -373,7 +630,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -383,7 +640,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_DATA.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -391,754 +648,754 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 6 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "tma_info_flopc / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_write_bw" - }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", "MetricGroup": "Flops;FpVector;InsType;Server", - "MetricName": "tma_info_iparith_amx_f16", - "MetricThreshold": "tma_info_iparith_amx_f16 < 10", + "MetricName": "tma_info_inst_mix_iparith_amx_f16", + "MetricThreshold": "tma_info_inst_mix_iparith_amx_f16 < 10", "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." }, { "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", "MetricGroup": "InsType;IntVector;Server", - "MetricName": "tma_info_iparith_amx_int8", - "MetricThreshold": "tma_info_iparith_amx_int8 < 10", + "MetricName": "tma_info_inst_mix_iparith_amx_int8", + "MetricThreshold": "tma_info_inst_mix_iparith_amx_int8 < 10", "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, - { - "BriefDescription": "Instructions per a microcode Assist invocation", - "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", - "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_ipassist", - "MetricThreshold": "tma_info_ipassist < 100e3", - "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" - }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_flopc", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_flopc", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 13", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 13", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_nonsilent_pki" - }, - { - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_silent_pki" + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency" + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" + }, + { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" + }, + { + "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", + "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_strings_cycles", + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1" + }, + { + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "tma_info_core_flopc / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_write_bw" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_dram_read_latency", + "MetricName": "tma_info_system_mem_dram_read_latency", "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average number of parallel data read requests to external memory", "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", + "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_pmm_read_latency", + "MetricName": "tma_info_system_mem_pmm_read_latency", "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", + "MetricName": "tma_info_system_mem_read_latency", "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, - { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" - }, - { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" - }, - { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" - }, - { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" - }, - { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" - }, - { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches" - }, - { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" - }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_read_bw" + "MetricName": "tma_info_system_pmm_read_bw" }, { "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_write_bw" - }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, - { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization" + "MetricName": "tma_info_system_pmm_write_bw" }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "uncore_cha_0@event\\=0x1@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" - }, - { - "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", - "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_strings_cycles", - "MetricThreshold": "tma_info_strings_cycles > 0.1" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Tera Integer (matrix) Operations Per Second", "MetricExpr": "8 * AMX_OPS_RETIRED.INT8 / 1e12 / duration_time", "MetricGroup": "Cor;HPC;IntVector;Server", - "MetricName": "tma_info_tiops" + "MetricName": "tma_info_system_tiops" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" - }, - { - "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", - "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_system_turbo_utilization" }, { "BriefDescription": "Cross-socket Ultra Path Interconnect (UPI) data transmit bandwidth for data only [MB / sec]", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 64 / 9 / 1e6", "MetricGroup": "Server;SoC", - "MetricName": "tma_info_upi_data_transmit_bw" + "MetricName": "tma_info_system_upi_data_transmit_bw" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "TOPDOWN.SLOTS", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 9" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 9" }, { "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_L4_group;tma_int_operations_group", "MetricName": "tma_int_amx", "MetricThreshold": "tma_int_amx > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1156,7 +1413,7 @@ }, { "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_128b", "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1165,7 +1422,7 @@ }, { "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1174,7 +1431,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_TAG.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1183,7 +1440,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1193,7 +1450,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1202,7 +1459,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1211,20 +1468,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "33 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "33 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "DECODE.LCP / tma_info_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1239,7 +1496,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1256,7 +1513,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1264,7 +1521,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "71 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "71 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1274,7 +1531,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1293,7 +1550,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to memory bandwidth Allocation feature (RDT's memory bandwidth throttling).", - "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_clks", + "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;Server;TopdownL5;tma_L5_group;tma_mem_bandwidth_group", "MetricName": "tma_mba_stalls", "MetricThreshold": "tma_mba_stalls > 0.1 & (tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1301,25 +1558,25 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1329,7 +1586,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_clks", + "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1338,7 +1595,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", @@ -1346,7 +1603,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1355,25 +1612,25 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", - "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_clks", + "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_thread_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", @@ -1382,7 +1639,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / UOPS_ISSUED.ANY) / tma_info_clks", + "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1391,7 +1648,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", - "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", @@ -1400,7 +1657,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -1419,7 +1676,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", - "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_slots", + "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_page_faults", "MetricThreshold": "tma_page_faults > 0.05", @@ -1428,7 +1685,7 @@ }, { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1437,7 +1694,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1446,7 +1703,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1455,7 +1712,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1464,7 +1721,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1473,7 +1730,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_clks", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1482,7 +1739,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1491,7 +1748,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1500,7 +1757,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1509,7 +1766,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1518,7 +1775,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "149 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "149 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1527,7 +1784,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1537,7 +1794,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1546,7 +1803,7 @@ }, { "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", - "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_slots)", + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", "MetricName": "tma_shuffles", "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1554,7 +1811,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_clks", + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1563,7 +1820,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1572,7 +1829,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1581,16 +1838,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_clks", + "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1599,7 +1856,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1608,7 +1865,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1617,7 +1874,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1634,7 +1891,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1642,7 +1899,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1651,7 +1908,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_clks", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -1694,5 +1951,17 @@ "MetricGroup": "transaction", "MetricName": "tsx_transactional_cycles", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_transmit_bw", + "ScaleUnit": "1MB/s" } ] diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json index 08faf38115d9..6800de05c836 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json @@ -464,7 +464,7 @@ "Unit": "M2M" }, { - "BriefDescription": "Counts the time when FM didn? do d2c for fill reads (cross tile case)", + "BriefDescription": "Counts the time when FM didn't do d2c for fill reads (cross tile case)", "EventCode": "0x4a", "EventName": "UNC_M2M_DIRECT2CORE_NOT_TAKEN_NOTFORKED", "PerPkg": "1", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json index 225333561295..3ff9e9b722c8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json @@ -2480,11 +2480,11 @@ "Unit": "iMC" }, { - "BriefDescription": "DRAM Precharge commands. : Precharge due to (?)", + "BriefDescription": "DRAM Precharge commands", "EventCode": "0x03", "EventName": "UNC_M_PRE_COUNT.PGT", "PerPkg": "1", - "PublicDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.", + "PublicDescription": "DRAM Precharge commands. Counts the number of DRAM Precharge commands sent on this channel.", "UMask": "0x88", "Unit": "iMC" }, @@ -3236,7 +3236,7 @@ "Unit": "iMC" }, { - "BriefDescription": "2LM Tag check hit due to memory read (bug?)", + "BriefDescription": "2LM Tag check hit due to memory read", "EventCode": "0xd3", "EventName": "UNC_M_TAGCHK.NM_RD_HIT", "PerPkg": "1", @@ -3244,7 +3244,7 @@ "Unit": "iMC" }, { - "BriefDescription": "2LM Tag check hit due to memory write (bug?)", + "BriefDescription": "2LM Tag check hit due to memory write", "EventCode": "0xd3", "EventName": "UNC_M_TAGCHK.NM_WR_HIT", "PerPkg": "1", -- cgit v1.2.3 From b522c8aff810b06810d7791f1ece07758ed26194 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:38:00 -0700 Subject: perf vendor events intel: Update skylake/skylakex events/metrics Update skylake events to v60 and skylakex events to v1.30, adding the events FP_ARITH_INST_RETIRED.4_FLOPS, FP_ARITH_INST_RETIRED.8_FLOPS, FP_ARITH_INST_RETIRED.SCALAR, FP_ARITH_INST_RETIRED.VECTOR and INT_MISC.CLEARS_COUNT. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 4 +- .../arch/x86/skylake/floating-point.json | 8 + .../perf/pmu-events/arch/x86/skylake/pipeline.json | 15 +- .../pmu-events/arch/x86/skylake/skl-metrics.json | 875 ++++++++------- .../arch/x86/skylakex/floating-point.json | 31 + .../pmu-events/arch/x86/skylakex/pipeline.json | 23 +- .../pmu-events/arch/x86/skylakex/skx-metrics.json | 1183 ++++++++++++-------- 7 files changed, 1219 insertions(+), 920 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 59afd27feb1d..4731a92af9f9 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -26,8 +26,8 @@ GenuineIntel-6-2A,v19,sandybridge,core GenuineIntel-6-(8F|CF),v1.13,sapphirerapids,core GenuineIntel-6-AF,v1.00,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core -GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v55,skylake,core -GenuineIntel-6-55-[01234],v1.29,skylakex,core +GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v56,skylake,core +GenuineIntel-6-55-[01234],v1.30,skylakex,core GenuineIntel-6-86,v1.20,snowridgex,core GenuineIntel-6-8[CD],v1.10,tigerlake,core GenuineIntel-6-2C,v4,westmereep-dp,core diff --git a/tools/perf/pmu-events/arch/x86/skylake/floating-point.json b/tools/perf/pmu-events/arch/x86/skylake/floating-point.json index 4d494a5cabbf..5891bd74af60 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/skylake/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x18" + }, { "BriefDescription": "Counts once for most SIMD scalar computational floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", "EventCode": "0xC7", diff --git a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json index 2dfc3af08eff..cc800fb8180a 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json @@ -26,12 +26,21 @@ "UMask": "0x4" }, { - "BriefDescription": "Conditional branch instructions retired.", + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "Errata": "SKL091", + "EventCode": "0xC4", + "EventName": "BR_INST_RETIRED.COND", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "SampleAfterValue": "400009", + "UMask": "0x1" + }, + { + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "Errata": "SKL091", "EventCode": "0xC4", "EventName": "BR_INST_RETIRED.CONDITIONAL", "PEBS": "1", - "PublicDescription": "This event counts conditional branch instructions retired.", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "SampleAfterValue": "400009", "UMask": "0x1" }, @@ -405,9 +414,9 @@ "UMask": "0x1" }, { - "AnyThread": "1", "BriefDescription": "Clears speculative count", "CounterMask": "1", + "EdgeDetect": "1", "EventCode": "0x0D", "EventName": "INT_MISC.CLEARS_COUNT", "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index 21ef6c9be816..2ed88842b880 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -50,7 +50,7 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, @@ -71,7 +71,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -80,7 +80,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -88,7 +88,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_slots", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -97,7 +97,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -107,7 +107,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -123,12 +123,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -146,7 +146,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -156,7 +156,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(18.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(18.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -177,7 +177,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "16.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -186,16 +186,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -205,7 +205,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -214,45 +214,45 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "22 * tma_info_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "22 * tma_info_system_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -262,11 +262,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -274,14 +274,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -347,7 +347,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -366,7 +366,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_slots", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -376,7 +376,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_clks", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -384,220 +384,231 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { @@ -605,416 +616,404 @@ "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" + }, + { + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" }, { "BriefDescription": "Average number of parallel data read requests to external memory", "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_OCCUPANCY.DATA_READ@thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", + "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { "BriefDescription": "Average number of parallel requests to external memory", "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_parallel_requests", + "MetricName": "tma_info_system_mem_parallel_requests", "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_socket_clks / duration_time)", + "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_system_socket_clks / duration_time)", "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", + "MetricName": "tma_info_system_mem_read_latency", "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" + "MetricName": "tma_info_system_mem_request_latency" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", + "MetricGroup": "SMT", + "MetricName": "tma_info_system_smt_2t_utilization" }, { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "BriefDescription": "Socket actual clocks when any core is active on that socket", + "MetricExpr": "UNC_CLOCK.SOCKET", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_socket_clks" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" }, { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." }, { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" }, { "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", + "MetricExpr": "4 * tma_info_core_core_clks", "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, - { - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", - "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" - }, - { - "BriefDescription": "Socket actual clocks when any core is active on that socket", - "MetricExpr": "UNC_CLOCK.SOCKET", - "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" - }, - { - "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", - "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1023,7 +1022,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1033,7 +1032,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1042,7 +1041,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1051,20 +1050,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "6.5 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "6.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1079,7 +1078,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1097,7 +1096,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1105,7 +1104,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", - "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1125,20 +1124,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1162,7 +1161,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1171,19 +1170,19 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1198,7 +1197,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1234,7 +1233,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1243,7 +1242,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1252,7 +1251,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -1261,7 +1260,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -1279,7 +1278,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1288,7 +1287,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1297,7 +1296,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -1306,7 +1305,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1315,7 +1314,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1324,7 +1323,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1333,7 +1332,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1342,7 +1341,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1350,7 +1349,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1360,7 +1359,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1370,7 +1369,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1379,7 +1378,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1388,16 +1387,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1406,7 +1405,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1416,7 +1415,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1425,7 +1424,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1441,7 +1440,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1449,7 +1448,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "9 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json b/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json index 64dd36387209..384b3c551a1f 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json @@ -31,6 +31,14 @@ "SampleAfterValue": "2000003", "UMask": "0x20" }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x18" + }, { "BriefDescription": "Counts number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xC7", @@ -47,6 +55,22 @@ "SampleAfterValue": "2000003", "UMask": "0x80" }, + { + "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x18" + }, + { + "BriefDescription": "Counts once for most SIMD scalar computational floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.SCALAR", + "PublicDescription": "Counts once for most SIMD scalar computational single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SIMD scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "2000003", + "UMask": "0x3" + }, { "BriefDescription": "Counts once for most SIMD scalar computational double precision floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", "EventCode": "0xC7", @@ -63,6 +87,13 @@ "SampleAfterValue": "2000003", "UMask": "0x2" }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "EventCode": "0xC7", + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", + "SampleAfterValue": "2000003", + "UMask": "0xfc" + }, { "BriefDescription": "Cycles with any input/output SSE or FP assist", "CounterMask": "1", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json index 0f06e314fe36..31a1663d57f8 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json @@ -26,12 +26,21 @@ "UMask": "0x4" }, { - "BriefDescription": "Conditional branch instructions retired.", + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "Errata": "SKL091", + "EventCode": "0xC4", + "EventName": "BR_INST_RETIRED.COND", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", + "SampleAfterValue": "400009", + "UMask": "0x1" + }, + { + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "Errata": "SKL091", "EventCode": "0xC4", "EventName": "BR_INST_RETIRED.CONDITIONAL", "PEBS": "1", - "PublicDescription": "This event counts conditional branch instructions retired.", + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", "SampleAfterValue": "400009", "UMask": "0x1" }, @@ -413,6 +422,16 @@ "SampleAfterValue": "2000003", "UMask": "0x1" }, + { + "BriefDescription": "Clears speculative count", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0x0D", + "EventName": "INT_MISC.CLEARS_COUNT", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, { "BriefDescription": "Cycles the issue-stage is waiting for front-end to fetch from resteered path following branch misprediction or machine clear events.", "EventCode": "0x0D", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index eb6f12c0343d..507d39efacc8 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -50,10 +50,219 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "dtlb_2mb_large_page_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3) * 4 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12CC0233@ / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40433@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40433@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40432@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40431@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12D40433@ / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -69,9 +278,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -80,7 +295,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -88,7 +303,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_slots", + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -97,7 +312,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -107,7 +322,7 @@ }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -123,12 +338,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -146,7 +361,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -156,7 +371,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -177,7 +392,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -186,16 +401,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -205,7 +420,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -214,45 +429,45 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(110 * tma_info_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_clks", + "MetricExpr": "(110 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -262,11 +477,11 @@ { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -274,14 +489,14 @@ "MetricExpr": "tma_frontend_bound - tma_fetch_latency", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -356,7 +571,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -375,7 +590,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_slots", + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -385,7 +600,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_clks", + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -393,686 +608,692 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_core_ipmispredict", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", + "MetricName": "tma_info_core_ipmispredict", + "MetricgroupNoGroup": "TopdownL1" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_read_bw" - }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_write_bw" - }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", - "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 9", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 9", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_nonsilent_pki" - }, - { - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_silent_pki" + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" }, { - "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", - "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_dram_read_latency", - "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_read_bw" }, { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_write_bw" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", + "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_dram_read_latency", + "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license0_utilization", + "MetricName": "tma_info_system_power_license0_utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license1_utilization", - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", + "MetricName": "tma_info_system_power_license1_utilization", + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks)", + "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks)", "MetricGroup": "Power", - "MetricName": "tma_info_power_license2_utilization", - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", + "MetricName": "tma_info_system_power_license2_utilization", + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "4 * tma_info_core_clks", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, { "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "MetricName": "tma_info_system_smt_2t_utilization" }, { "BriefDescription": "Socket actual clocks when any core is active on that socket", "MetricExpr": "cha_0@event\\=0x0@", "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" - }, - { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "MetricName": "tma_info_system_turbo_utilization" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "4 * tma_info_core_core_clks", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" }, { "BriefDescription": "Uops Per Instruction", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 6" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 6" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1081,7 +1302,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1091,7 +1312,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1100,7 +1321,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1109,20 +1330,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1137,7 +1358,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1155,7 +1376,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1163,7 +1384,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "59.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1172,7 +1393,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", - "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1192,20 +1413,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1229,7 +1450,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1238,19 +1459,19 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, @@ -1265,7 +1486,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1301,7 +1522,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1310,7 +1531,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1319,7 +1540,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_2", "MetricThreshold": "tma_port_2 > 0.6", @@ -1328,7 +1549,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", "MetricName": "tma_port_3", "MetricThreshold": "tma_port_3 > 0.6", @@ -1346,7 +1567,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1355,7 +1576,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1364,7 +1585,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", "MetricName": "tma_port_7", "MetricThreshold": "tma_port_7 > 0.6", @@ -1373,7 +1594,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1382,7 +1603,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1391,7 +1612,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1400,7 +1621,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_clks", + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1409,7 +1630,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_clks", + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1418,7 +1639,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1427,7 +1648,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "127 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1436,7 +1657,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1446,7 +1667,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1456,7 +1677,7 @@ { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1465,7 +1686,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1474,16 +1695,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1492,7 +1713,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1502,7 +1723,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", "MetricConstraint": "NO_GROUP_EVENTS_NMI", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1511,7 +1732,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1527,7 +1748,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1535,7 +1756,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "9 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -1578,5 +1799,17 @@ "MetricGroup": "transaction", "MetricName": "tsx_transactional_cycles", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_transmit_bw", + "ScaleUnit": "1MB/s" } ] -- cgit v1.2.3 From d97b82aead504a631033ebbf49cbe104dc603926 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:38:01 -0700 Subject: perf vendor events intel: Update snowridgex events Update snowridgex to v1.21 that marks deprecated a number of events and adds improves descriptions. The events data was generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- tools/perf/pmu-events/arch/x86/snowridgex/cache.json | 7 +++++++ tools/perf/pmu-events/arch/x86/snowridgex/memory.json | 2 ++ tools/perf/pmu-events/arch/x86/snowridgex/other.json | 10 ++++++++++ tools/perf/pmu-events/arch/x86/snowridgex/pipeline.json | 3 +++ .../arch/x86/snowridgex/uncore-interconnect.json | 14 +++++++------- tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json | 8 -------- .../perf/pmu-events/arch/x86/snowridgex/uncore-memory.json | 7 +++---- .../perf/pmu-events/arch/x86/snowridgex/uncore-power.json | 6 +++--- 9 files changed, 36 insertions(+), 23 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 4731a92af9f9..4a1a2b8d6201 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -28,7 +28,7 @@ GenuineIntel-6-AF,v1.00,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v56,skylake,core GenuineIntel-6-55-[01234],v1.30,skylakex,core -GenuineIntel-6-86,v1.20,snowridgex,core +GenuineIntel-6-86,v1.21,snowridgex,core GenuineIntel-6-8[CD],v1.10,tigerlake,core GenuineIntel-6-2C,v4,westmereep-dp,core GenuineIntel-6-25,v3,westmereep-sp,core diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/cache.json b/tools/perf/pmu-events/arch/x86/snowridgex/cache.json index 0ab90e3bf76b..c6be60584522 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/cache.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/cache.json @@ -72,6 +72,7 @@ "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).", "EventCode": "0x34", "EventName": "MEM_BOUND_STALLS.IFETCH", + "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).", "SampleAfterValue": "200003", "UMask": "0x38" }, @@ -437,6 +438,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT", "MSRIndex": "0x1a6,0x1a7", @@ -446,6 +448,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HITM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -455,6 +458,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HIT_NO_FWD", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -464,6 +468,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_HIT_WITH_FWD", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -473,6 +478,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_MISS", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -482,6 +488,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_HIT.SNOOP_NOT_NEEDED", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_NOT_NEEDED", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/memory.json b/tools/perf/pmu-events/arch/x86/snowridgex/memory.json index 18621909d1a9..c02eb0e836ad 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/memory.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/memory.json @@ -96,6 +96,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_MISS", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -105,6 +106,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.L3_MISS_LOCAL", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/other.json b/tools/perf/pmu-events/arch/x86/snowridgex/other.json index 00ae180ded25..fefbc383b840 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/other.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/other.json @@ -1,6 +1,7 @@ [ { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.SELF_LOCKS", + "Deprecated": "1", "EdgeDetect": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.ALL", @@ -16,6 +17,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.BLOCK_CYCLES", + "Deprecated": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.CYCLES_OTHER_BLOCK", "SampleAfterValue": "200003", @@ -23,6 +25,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event BUS_LOCK.LOCK_CYCLES", + "Deprecated": "1", "EventCode": "0x63", "EventName": "BUS_LOCK.CYCLES_SELF_BLOCK", "SampleAfterValue": "200003", @@ -46,6 +49,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_DRAM_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_DRAM_HIT", "SampleAfterValue": "200003", @@ -53,6 +57,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_L2_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_L2_HIT", "SampleAfterValue": "200003", @@ -60,6 +65,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event MEM_BOUND_STALLS.LOAD_LLC_HIT", + "Deprecated": "1", "EventCode": "0x34", "EventName": "C0_STALLS.LOAD_LLC_HIT", "SampleAfterValue": "200003", @@ -207,6 +213,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.ANY_RESPONSE", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -216,6 +223,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.DRAM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -225,6 +233,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.LOCAL_DRAM", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.LOCAL_DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -234,6 +243,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event OCR.DEMAND_DATA_AND_L1PF_RD.OUTSTANDING", + "Deprecated": "1", "EventCode": "0XB7", "EventName": "OCR.DEMAND_DATA_RD.OUTSTANDING", "MSRIndex": "0x1a6", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/pipeline.json b/tools/perf/pmu-events/arch/x86/snowridgex/pipeline.json index 9dd8c909facc..c483c0838e08 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/pipeline.json @@ -165,6 +165,7 @@ }, { "BriefDescription": "This event is deprecated.", + "Deprecated": "1", "EventCode": "0xcd", "EventName": "CYCLES_DIV_BUSY.ANY", "SampleAfterValue": "2000003" @@ -283,6 +284,7 @@ }, { "BriefDescription": "This event is deprecated. Refer to new event TOPDOWN_BAD_SPECULATION.FASTNUKE", + "Deprecated": "1", "EventCode": "0x73", "EventName": "TOPDOWN_BAD_SPECULATION.MONUKE", "SampleAfterValue": "1000003", @@ -338,6 +340,7 @@ }, { "BriefDescription": "This event is deprecated.", + "Deprecated": "1", "EventCode": "0x74", "EventName": "TOPDOWN_BE_BOUND.STORE_BUFFER", "SampleAfterValue": "1000003", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json index de3840078e21..7e2895f7fe3d 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-interconnect.json @@ -590,7 +590,7 @@ "EventCode": "0x0C", "EventName": "UNC_I_TxS_REQUEST_OCCUPANCY", "PerPkg": "1", - "PublicDescription": "Outbound Request Queue Occupancy : Accumultes the number of outstanding outbound requests from the IRP to the switch (towards the devices). This can be used in conjuection with the allocations event in order to calculate average latency of outbound requests.", + "PublicDescription": "Outbound Request Queue Occupancy : Accumulates the number of outstanding outbound requests from the IRP to the switch (towards the devices). This can be used in conjunction with the allocations event in order to calculate average latency of outbound requests.", "Unit": "IRP" }, { @@ -5570,7 +5570,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 0", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 0", "EventCode": "0x4D", "EventName": "UNC_M2M_WPQ_NO_REG_CRD.CHN0", "PerPkg": "1", @@ -5578,7 +5578,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 1", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 1", "EventCode": "0x4D", "EventName": "UNC_M2M_WPQ_NO_REG_CRD.CHN1", "PerPkg": "1", @@ -5586,7 +5586,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 2", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Regular : Channel 2", "EventCode": "0x4D", "EventName": "UNC_M2M_WPQ_NO_REG_CRD.CHN2", "PerPkg": "1", @@ -5594,7 +5594,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 0", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 0", "EventCode": "0x4E", "EventName": "UNC_M2M_WPQ_NO_SPEC_CRD.CHN0", "PerPkg": "1", @@ -5602,7 +5602,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 1", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 1", "EventCode": "0x4E", "EventName": "UNC_M2M_WPQ_NO_SPEC_CRD.CHN1", "PerPkg": "1", @@ -5610,7 +5610,7 @@ "Unit": "M2M" }, { - "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 2", + "BriefDescription": "M2M->iMC WPQ Cycles w/Credits - Special : Channel 2", "EventCode": "0x4E", "EventName": "UNC_M2M_WPQ_NO_SPEC_CRD.CHN2", "PerPkg": "1", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json index 996028071ee4..ecdd6f0f8e8f 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-io.json @@ -34,7 +34,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART0_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART0_FREERUN", "UMask": "0x20", "Unit": "iio_free_running" }, @@ -43,7 +42,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART1_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART1_FREERUN", "UMask": "0x21", "Unit": "iio_free_running" }, @@ -52,7 +50,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART2_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART2_FREERUN", "UMask": "0x22", "Unit": "iio_free_running" }, @@ -61,7 +58,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART3_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART3_FREERUN", "UMask": "0x23", "Unit": "iio_free_running" }, @@ -70,7 +66,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART4_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART4_FREERUN", "UMask": "0x24", "Unit": "iio_free_running" }, @@ -79,7 +74,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART5_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART5_FREERUN", "UMask": "0x25", "Unit": "iio_free_running" }, @@ -88,7 +82,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART6_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART6_FREERUN", "UMask": "0x26", "Unit": "iio_free_running" }, @@ -97,7 +90,6 @@ "EventCode": "0xff", "EventName": "UNC_IIO_BANDWIDTH_IN.PART7_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_IIO_BANDWIDTH_IN.PART7_FREERUN", "UMask": "0x27", "Unit": "iio_free_running" }, diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-memory.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-memory.json index 530e9b71b92a..b80911d498dd 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-memory.json @@ -130,7 +130,6 @@ "EventCode": "0xff", "EventName": "UNC_M_CLOCKTICKS_FREERUN", "PerPkg": "1", - "PublicDescription": "UNC_M_CLOCKTICKS_FREERUN", "UMask": "0x10", "Unit": "imc_free_running" }, @@ -322,7 +321,7 @@ "EventCode": "0x02", "EventName": "UNC_M_PRE_COUNT.PGT", "PerPkg": "1", - "PublicDescription": "DRAM Precharge commands. : Precharge due to page table : Counts the number of DRAM Precharge commands sent on this channel. : Prechages from Page Table", + "PublicDescription": "DRAM Precharge commands. : Precharge due to page table : Counts the number of DRAM Precharge commands sent on this channel. : Precharges from Page Table", "UMask": "0x10", "Unit": "iMC" }, @@ -497,7 +496,7 @@ "EventCode": "0x82", "EventName": "UNC_M_WPQ_OCCUPANCY_PCH0", "PerPkg": "1", - "PublicDescription": "Write Pending Queue Occupancy : Accumulates the occupancies of the Write Pending Queue each cycle. This can then be used to calculate both the average queue occupancy (in conjunction with the number of cycles not empty) and the average latency (in conjunction with the number of allocations). The WPQ is used to schedule write out to the memory controller and to track the writes. Requests allocate into the WPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after being issued to DRAM. Write requests themselves are able to complete (from the perspective of the rest of the system) as soon they have posted to the iMC. This is not to be confused with actually performing the write to DRAM. Therefore, the average latency for this queue is actually not useful for deconstruction intermediate write latencies. So, we provide filtering based on if the request has posted or not. By using the not posted filter, we can track how long writes spent in the iMC before completions were sent to the HA. The posted filter, on the other hand, provides information about how much queueing is actually happenning in the iMC for writes before they are actually issued to memory. High average occupancies will generally coincide with high write major mode counts.", + "PublicDescription": "Write Pending Queue Occupancy : Accumulates the occupancies of the Write Pending Queue each cycle. This can then be used to calculate both the average queue occupancy (in conjunction with the number of cycles not empty) and the average latency (in conjunction with the number of allocations). The WPQ is used to schedule write out to the memory controller and to track the writes. Requests allocate into the WPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after being issued to DRAM. Write requests themselves are able to complete (from the perspective of the rest of the system) as soon they have posted to the iMC. This is not to be confused with actually performing the write to DRAM. Therefore, the average latency for this queue is actually not useful for deconstruction intermediate write latencies. So, we provide filtering based on if the request has posted or not. By using the not posted filter, we can track how long writes spent in the iMC before completions were sent to the HA. The posted filter, on the other hand, provides information about how much queueing is actually happening in the iMC for writes before they are actually issued to memory. High average occupancies will generally coincide with high write major mode counts.", "Unit": "iMC" }, { @@ -505,7 +504,7 @@ "EventCode": "0x83", "EventName": "UNC_M_WPQ_OCCUPANCY_PCH1", "PerPkg": "1", - "PublicDescription": "Write Pending Queue Occupancy : Accumulates the occupancies of the Write Pending Queue each cycle. This can then be used to calculate both the average queue occupancy (in conjunction with the number of cycles not empty) and the average latency (in conjunction with the number of allocations). The WPQ is used to schedule write out to the memory controller and to track the writes. Requests allocate into the WPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after being issued to DRAM. Write requests themselves are able to complete (from the perspective of the rest of the system) as soon they have posted to the iMC. This is not to be confused with actually performing the write to DRAM. Therefore, the average latency for this queue is actually not useful for deconstruction intermediate write latencies. So, we provide filtering based on if the request has posted or not. By using the not posted filter, we can track how long writes spent in the iMC before completions were sent to the HA. The posted filter, on the other hand, provides information about how much queueing is actually happenning in the iMC for writes before they are actually issued to memory. High average occupancies will generally coincide with high write major mode counts.", + "PublicDescription": "Write Pending Queue Occupancy : Accumulates the occupancies of the Write Pending Queue each cycle. This can then be used to calculate both the average queue occupancy (in conjunction with the number of cycles not empty) and the average latency (in conjunction with the number of allocations). The WPQ is used to schedule write out to the memory controller and to track the writes. Requests allocate into the WPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC. They deallocate after being issued to DRAM. Write requests themselves are able to complete (from the perspective of the rest of the system) as soon they have posted to the iMC. This is not to be confused with actually performing the write to DRAM. Therefore, the average latency for this queue is actually not useful for deconstruction intermediate write latencies. So, we provide filtering based on if the request has posted or not. By using the not posted filter, we can track how long writes spent in the iMC before completions were sent to the HA. The posted filter, on the other hand, provides information about how much queueing is actually happening in the iMC for writes before they are actually issued to memory. High average occupancies will generally coincide with high write major mode counts.", "Unit": "iMC" }, { diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json index 27fc155f1223..a61ffca2dfea 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json @@ -149,7 +149,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0", "PerPkg": "1", - "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { @@ -157,7 +157,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3", "PerPkg": "1", - "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { @@ -165,7 +165,7 @@ "EventCode": "0x80", "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6", "PerPkg": "1", - "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", + "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" }, { -- cgit v1.2.3 From bc4e41210e337d5df3ecd8a9a07cfd6f2d63815b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:38:02 -0700 Subject: perf vendor events intel: Update tigerlake events/metrics Update tigerlake events to v1.12 including the new events MEM_LOAD_MISC_RETIRED.UC and SQ_MISC.BUS_LOCK. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../perf/pmu-events/arch/x86/tigerlake/cache.json | 18 + .../pmu-events/arch/x86/tigerlake/pipeline.json | 1 + .../pmu-events/arch/x86/tigerlake/tgl-metrics.json | 970 ++++++++++----------- 4 files changed, 505 insertions(+), 486 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 4a1a2b8d6201..6543a68d4a17 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -29,7 +29,7 @@ GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v56,skylake,core GenuineIntel-6-55-[01234],v1.30,skylakex,core GenuineIntel-6-86,v1.21,snowridgex,core -GenuineIntel-6-8[CD],v1.10,tigerlake,core +GenuineIntel-6-8[CD],v1.12,tigerlake,core GenuineIntel-6-2C,v4,westmereep-dp,core GenuineIntel-6-25,v3,westmereep-sp,core GenuineIntel-6-2F,v3,westmereex,core diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/cache.json b/tools/perf/pmu-events/arch/x86/tigerlake/cache.json index 738249a6f488..c54fb65d3259 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/cache.json @@ -322,6 +322,16 @@ "SampleAfterValue": "20011", "UMask": "0x2" }, + { + "BriefDescription": "Retired instructions with at least 1 uncacheable load or lock.", + "Data_LA": "1", + "EventCode": "0xd4", + "EventName": "MEM_LOAD_MISC_RETIRED.UC", + "PEBS": "1", + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access", + "SampleAfterValue": "100007", + "UMask": "0x4" + }, { "BriefDescription": "Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.", "Data_LA": "1", @@ -510,6 +520,14 @@ "SampleAfterValue": "1000003", "UMask": "0x4" }, + { + "BriefDescription": "Counts bus locks, accounts for cache line split locks and UC locks.", + "EventCode": "0xf4", + "EventName": "SQ_MISC.BUS_LOCK", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", + "SampleAfterValue": "100003", + "UMask": "0x10" + }, { "BriefDescription": "Cycles the superQ cannot take any more entries.", "EventCode": "0xf4", diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json index a0aeeb801fd7..020801cbd7e3 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json @@ -395,6 +395,7 @@ { "BriefDescription": "Clears speculative count", "CounterMask": "1", + "EdgeDetect": "1", "EventCode": "0x0d", "EventName": "INT_MISC.CLEARS_COUNT", "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index ae62bacf9f5e..d0538a754288 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -79,7 +79,7 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -88,7 +88,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -96,7 +96,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * ASSISTS.ANY / tma_info_slots", + "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -105,7 +105,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -125,7 +125,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", - "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_branch_instructions", "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", @@ -138,12 +138,12 @@ "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -161,7 +161,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -171,7 +171,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(49 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(49 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -191,7 +191,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "48 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "48 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -200,16 +200,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -219,7 +219,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -228,43 +228,43 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "54 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "54 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -273,11 +273,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -285,14 +285,14 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_slots", + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -321,7 +321,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -330,7 +330,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -339,7 +339,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -348,7 +348,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -357,7 +357,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_512b", "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -366,7 +366,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -386,7 +386,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_clks", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -394,696 +394,696 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { - "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", + "MetricGroup": "Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", + "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", + "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", + "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" }, { "BriefDescription": "Fraction of branches that are non-taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" + "MetricName": "tma_info_branches_cond_nt" }, { "BriefDescription": "Fraction of branches that are taken conditionals", "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "MetricName": "tma_info_branches_cond_tk" }, { - "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", - "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches" }, { "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" + "MetricName": "tma_info_core_core_clks" }, { "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" + "MetricName": "tma_info_core_coreipc" }, { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" }, { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." }, { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" }, { "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 5 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", - "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 5 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" + "MetricName": "tma_info_frontend_dsb_switch_cost" }, { "BriefDescription": "Average number of Uops issued by front-end when it issued something", "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" + "MetricName": "tma_info_frontend_fetch_upc" }, { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" }, { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" }, { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" }, { - "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", - "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", - "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", + "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", + "MetricGroup": "Fed;LSD", + "MetricName": "tma_info_frontend_lsd_coverage" }, { - "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", - "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" }, { "BriefDescription": "Total number of retired Instructions", "MetricExpr": "INST_RETIRED.ANY", "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", + "MetricName": "tma_info_inst_mix_instructions", "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." }, { "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" }, { "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" }, { "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" }, { "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" }, { "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" }, { "BriefDescription": "Instruction per taken branch", "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 11", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 11", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" }, { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" }, { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" }, { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" + "MetricName": "tma_info_memory_l1mpki" }, { "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" + "MetricName": "tma_info_memory_l1mpki_load" }, { "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" + "MetricName": "tma_info_memory_l2hpki_all" }, { "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" + "MetricName": "tma_info_memory_l2hpki_load" }, { "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" + "MetricName": "tma_info_memory_l2mpki" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" + "MetricName": "tma_info_memory_l2mpki_all" }, { "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" + "MetricName": "tma_info_memory_l2mpki_load" }, { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" }, { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" }, { "BriefDescription": "Average Latency for L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" }, { "BriefDescription": "Average Parallel L2 cache miss demand Loads", "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" + "MetricName": "tma_info_memory_oro_load_l2_mlp" }, { "BriefDescription": "Average Latency for L3 cache miss demand Loads", "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency" + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" }, { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" }, { "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" }, { - "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", - "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", - "MetricGroup": "Fed;LSD", - "MetricName": "tma_info_lsd_coverage" + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" }, { - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@", - "MetricGroup": "Mem;SoC", - "MetricName": "tma_info_mem_request_latency" + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { - "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", - "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" }, { - "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", - "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", - "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" }, { - "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", - "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", - "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" }, { - "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", - "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." }, { - "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", - "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", - "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", - "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" }, { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches" + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" }, { - "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_clks)", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + }, + { + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@", + "MetricGroup": "Mem;SoC", + "MetricName": "tma_info_system_mem_request_latency" }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license0_utilization", + "MetricName": "tma_info_system_power_license0_utilization", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license1_utilization", - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", + "MetricName": "tma_info_system_power_license1_utilization", + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." }, { "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks", + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks", "MetricGroup": "Power", - "MetricName": "tma_info_power_license2_utilization", - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", + "MetricName": "tma_info_system_power_license2_utilization", + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." }, { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", + "MetricGroup": "SMT", + "MetricName": "tma_info_system_smt_2t_utilization" }, { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization" }, { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization" + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" }, { - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", - "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" }, { - "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." }, { - "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", - "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "TOPDOWN.SLOTS", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization" }, { "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 7.5" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 7.5" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1092,7 +1092,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1102,7 +1102,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1111,7 +1111,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1120,20 +1120,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "17.5 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "17.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1148,7 +1148,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1165,7 +1165,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1174,7 +1174,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1183,10 +1183,10 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", "ScaleUnit": "100%" }, @@ -1202,20 +1202,20 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1239,7 +1239,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "tma_retiring * tma_info_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", + "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1248,28 +1248,28 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", - "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_clks", + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", "MetricName": "tma_mite_4wide", - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", "ScaleUnit": "100%" }, { @@ -1283,7 +1283,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1292,7 +1292,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -1311,7 +1311,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1320,7 +1320,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1329,7 +1329,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_5", "MetricThreshold": "tma_port_5 > 0.6", @@ -1338,7 +1338,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1347,7 +1347,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1356,7 +1356,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1365,7 +1365,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1374,7 +1374,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1383,7 +1383,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1392,7 +1392,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1402,7 +1402,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1411,7 +1411,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_clks", + "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1420,7 +1420,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1429,7 +1429,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1438,16 +1438,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1456,7 +1456,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1465,7 +1465,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1474,7 +1474,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1491,7 +1491,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1499,7 +1499,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1508,7 +1508,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "10 * BACLEARS.ANY / tma_info_clks", + "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", -- cgit v1.2.3 From 2b72cec9eef19d73c2a4a3e603004fdf2d93d9e6 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:41 +0530 Subject: perf: Extract building cache level for a CPU into separate function build_caches() builds the complete cache topology of the system by iterating over all CPU, building and comparing cache levels of each CPU, keeping only the unique ones at the end. Extract the unit that build the cache levels for a single CPU into a separate function. Expose this function, and the MAX_CACHE_LVL value to be used elsewhere in perf too. Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Gautham Shenoy Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-2-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 62 ++++++++++++++++++++++++++++++------------------ tools/perf/util/header.h | 4 ++++ 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 276870221ce0..560871736764 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1213,38 +1213,54 @@ static void cpu_cache_level__fprintf(FILE *out, struct cpu_cache_level *c) fprintf(out, "L%d %-15s %8s [%s]\n", c->level, c->type, c->size, c->map); } -#define MAX_CACHE_LVL 4 - -static int build_caches(struct cpu_cache_level caches[], u32 *cntp) +/* + * Build caches levels for a particular CPU from the data in + * /sys/devices/system/cpu/cpu/cache/ + * The cache level data is stored in caches[] from index at + * *cntp. + */ +int build_caches_for_cpu(u32 cpu, struct cpu_cache_level caches[], u32 *cntp) { - u32 i, cnt = 0; - u32 nr, cpu; u16 level; - nr = cpu__max_cpu().cpu; + for (level = 0; level < MAX_CACHE_LVL; level++) { + struct cpu_cache_level c; + int err; + u32 i; - for (cpu = 0; cpu < nr; cpu++) { - for (level = 0; level < MAX_CACHE_LVL; level++) { - struct cpu_cache_level c; - int err; + err = cpu_cache_level__read(&c, cpu, level); + if (err < 0) + return err; - err = cpu_cache_level__read(&c, cpu, level); - if (err < 0) - return err; + if (err == 1) + break; - if (err == 1) + for (i = 0; i < *cntp; i++) { + if (cpu_cache_level__cmp(&c, &caches[i])) break; + } - for (i = 0; i < cnt; i++) { - if (cpu_cache_level__cmp(&c, &caches[i])) - break; - } + if (i == *cntp) { + caches[*cntp] = c; + *cntp = *cntp + 1; + } else + cpu_cache_level__free(&c); + } - if (i == cnt) - caches[cnt++] = c; - else - cpu_cache_level__free(&c); - } + return 0; +} + +static int build_caches(struct cpu_cache_level caches[], u32 *cntp) +{ + u32 nr, cpu, cnt = 0; + + nr = cpu__max_cpu().cpu; + + for (cpu = 0; cpu < nr; cpu++) { + int ret = build_caches_for_cpu(cpu, caches, &cnt); + + if (ret) + return ret; } *cntp = cnt; return 0; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 59eeb4a32ac5..7c16a250e738 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -179,7 +179,11 @@ int do_write(struct feat_fd *fd, const void *buf, size_t size); int write_padded(struct feat_fd *fd, const void *bf, size_t count, size_t count_aligned); +#define MAX_CACHE_LVL 4 + int is_cpu_online(unsigned int cpu); +int build_caches_for_cpu(u32 cpu, struct cpu_cache_level caches[], u32 *cntp); + /* * arch specific callback */ -- cgit v1.2.3 From 995ed074b829f293586028560f2f27f47889df64 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:42 +0530 Subject: perf stat: Setup the foundation to allow aggregation based on cache topology Processors based on chiplet architecture, such as AMD EPYC and Hygon do not expose the chiplet details in the sysfs CPU topology information. However, this information can be derived from the per CPU cache level information from the sysfs. 'perf stat' has already supported aggregation based on topology information using core ID, socket ID, etc. It'll be useful to aggregate based on the cache topology to detect problems like imbalance and cache-to-cache sharing at various cache levels. This patch lays the foundation for aggregating data in 'perf stat' based on the processor's cache topology. The cmdline option to aggregate data based on the cache topology is added in Patch 4 of the series while this patch sets up all the necessary functions and variables required to support the new aggregation option. The patch also adds support to display per-cache aggregation, or save it as a JSON or CSV, as splitting it into a separate patch would break builds when compiling with "-Werror=switch-enum" where the compiler will complain about the lack of handling for the AGGR_CACHE case in the output functions. Committer notes: Don't use perf_stat_config in tools/perf/util/cpumap.c, this would make code that is in util/, thus not really specific to a single builtin, use a specific builtin config structure. Move the functions introduced in this patch from tools/perf/util/cpumap.c since it needs access to builtin specific and is not strictly needed to live in the util/ directory. With this 'perf test python' is back building. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-3-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/cpumap.h | 5 + tools/perf/builtin-stat.c | 209 ++++++++++++++++++++++++++++++++++- tools/perf/util/cpumap.c | 10 ++ tools/perf/util/cpumap.h | 7 ++ tools/perf/util/stat-display.c | 17 +++ tools/perf/util/stat.h | 2 + 6 files changed, 249 insertions(+), 1 deletion(-) diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 3f43f770cdac..8724dde79342 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -11,6 +11,11 @@ struct perf_cpu { int cpu; }; +struct perf_cache { + int cache_lvl; + int cache; +}; + struct perf_cpu_map; LIBPERF_API struct perf_cpu_map *perf_cpu_map__dummy_new(void); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bc45cee3f77c..0528d1bc15d2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -142,6 +142,7 @@ struct perf_stat { struct perf_cpu_map *cpus; struct perf_thread_map *threads; enum aggr_mode aggr_mode; + u32 aggr_level; }; static struct perf_stat perf_stat; @@ -151,6 +152,7 @@ static volatile sig_atomic_t done = 0; static struct perf_stat_config stat_config = { .aggr_mode = AGGR_GLOBAL, + .aggr_level = MAX_CACHE_LVL + 1, .scale = true, .unit_width = 4, /* strlen("unit") */ .run_count = 1, @@ -1249,8 +1251,132 @@ static struct option stat_options[] = { OPT_END() }; +/** + * Calculate the cache instance ID from the map in + * /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list + * Cache instance ID is the first CPU reported in the shared_cpu_list file. + */ +static int cpu__get_cache_id_from_map(struct perf_cpu cpu, char *map) +{ + int id; + struct perf_cpu_map *cpu_map = perf_cpu_map__new(map); + + /* + * If the map contains no CPU, consider the current CPU to + * be the first online CPU in the cache domain else use the + * first online CPU of the cache domain as the ID. + */ + if (perf_cpu_map__empty(cpu_map)) + id = cpu.cpu; + else + id = perf_cpu_map__cpu(cpu_map, 0).cpu; + + /* Free the perf_cpu_map used to find the cache ID */ + perf_cpu_map__put(cpu_map); + + return id; +} + +/** + * cpu__get_cache_id - Returns 0 if successful in populating the + * cache level and cache id. Cache level is read from + * /sys/devices/system/cpu/cpuX/cache/indexY/level where as cache instance ID + * is the first CPU reported by + * /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list + */ +static int cpu__get_cache_details(struct perf_cpu cpu, struct perf_cache *cache) +{ + int ret = 0; + u32 cache_level = stat_config.aggr_level; + struct cpu_cache_level caches[MAX_CACHE_LVL]; + u32 i = 0, caches_cnt = 0; + + cache->cache_lvl = (cache_level > MAX_CACHE_LVL) ? 0 : cache_level; + cache->cache = -1; + + ret = build_caches_for_cpu(cpu.cpu, caches, &caches_cnt); + if (ret) { + /* + * If caches_cnt is not 0, cpu_cache_level data + * was allocated when building the topology. + * Free the allocated data before returning. + */ + if (caches_cnt) + goto free_caches; + + return ret; + } + + if (!caches_cnt) + return -1; + + /* + * Save the data for the highest level if no + * level was specified by the user. + */ + if (cache_level > MAX_CACHE_LVL) { + int max_level_index = 0; + + for (i = 1; i < caches_cnt; ++i) { + if (caches[i].level > caches[max_level_index].level) + max_level_index = i; + } + + cache->cache_lvl = caches[max_level_index].level; + cache->cache = cpu__get_cache_id_from_map(cpu, caches[max_level_index].map); + + /* Reset i to 0 to free entire caches[] */ + i = 0; + goto free_caches; + } + + for (i = 0; i < caches_cnt; ++i) { + if (caches[i].level == cache_level) { + cache->cache_lvl = cache_level; + cache->cache = cpu__get_cache_id_from_map(cpu, caches[i].map); + } + + cpu_cache_level__free(&caches[i]); + } + +free_caches: + /* + * Free all the allocated cpu_cache_level data. + */ + while (i < caches_cnt) + cpu_cache_level__free(&caches[i++]); + + return ret; +} + +/** + * aggr_cpu_id__cache - Create an aggr_cpu_id with cache instache ID, cache + * level, die and socket populated with the cache instache ID, cache level, + * die and socket for cpu. The function signature is compatible with + * aggr_cpu_id_get_t. + */ +static struct aggr_cpu_id aggr_cpu_id__cache(struct perf_cpu cpu, void *data) +{ + int ret; + struct aggr_cpu_id id; + struct perf_cache cache; + + id = aggr_cpu_id__die(cpu, data); + if (aggr_cpu_id__is_empty(&id)) + return id; + + ret = cpu__get_cache_details(cpu, &cache); + if (ret) + return id; + + id.cache_lvl = cache.cache_lvl; + id.cache = cache.cache; + return id; +} + static const char *const aggr_mode__string[] = { [AGGR_CORE] = "core", + [AGGR_CACHE] = "cache", [AGGR_DIE] = "die", [AGGR_GLOBAL] = "global", [AGGR_NODE] = "node", @@ -1272,6 +1398,12 @@ static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __m return aggr_cpu_id__die(cpu, /*data=*/NULL); } +static struct aggr_cpu_id perf_stat__get_cache_id(struct perf_stat_config *config __maybe_unused, + struct perf_cpu cpu) +{ + return aggr_cpu_id__cache(cpu, /*data=*/NULL); +} + static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused, struct perf_cpu cpu) { @@ -1324,6 +1456,12 @@ static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *con return perf_stat__get_aggr(config, perf_stat__get_die, cpu); } +static struct aggr_cpu_id perf_stat__get_cache_id_cached(struct perf_stat_config *config, + struct perf_cpu cpu) +{ + return perf_stat__get_aggr(config, perf_stat__get_cache_id, cpu); +} + static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config, struct perf_cpu cpu) { @@ -1355,6 +1493,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode) return aggr_cpu_id__socket; case AGGR_DIE: return aggr_cpu_id__die; + case AGGR_CACHE: + return aggr_cpu_id__cache; case AGGR_CORE: return aggr_cpu_id__core; case AGGR_NODE: @@ -1378,6 +1518,8 @@ static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode) return perf_stat__get_socket_cached; case AGGR_DIE: return perf_stat__get_die_cached; + case AGGR_CACHE: + return perf_stat__get_cache_id_cached; case AGGR_CORE: return perf_stat__get_core_cached; case AGGR_NODE: @@ -1490,6 +1632,60 @@ static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(struct perf_cpu cpu, voi return id; } +static void perf_env__get_cache_id_for_cpu(struct perf_cpu cpu, struct perf_env *env, + u32 cache_level, struct aggr_cpu_id *id) +{ + int i; + int caches_cnt = env->caches_cnt; + struct cpu_cache_level *caches = env->caches; + + id->cache_lvl = (cache_level > MAX_CACHE_LVL) ? 0 : cache_level; + id->cache = -1; + + if (!caches_cnt) + return; + + for (i = caches_cnt - 1; i > -1; --i) { + struct perf_cpu_map *cpu_map; + int map_contains_cpu; + + /* + * If user has not specified a level, find the fist level with + * the cpu in the map. Since building the map is expensive, do + * this only if levels match. + */ + if (cache_level <= MAX_CACHE_LVL && caches[i].level != cache_level) + continue; + + cpu_map = perf_cpu_map__new(caches[i].map); + map_contains_cpu = perf_cpu_map__idx(cpu_map, cpu); + perf_cpu_map__put(cpu_map); + + if (map_contains_cpu != -1) { + id->cache_lvl = caches[i].level; + id->cache = cpu__get_cache_id_from_map(cpu, caches[i].map); + return; + } + } +} + +static struct aggr_cpu_id perf_env__get_cache_aggr_by_cpu(struct perf_cpu cpu, + void *data) +{ + struct perf_env *env = data; + struct aggr_cpu_id id = aggr_cpu_id__empty(); + + if (cpu.cpu != -1) { + u32 cache_level = (perf_stat.aggr_level) ?: stat_config.aggr_level; + + id.socket = env->cpu[cpu.cpu].socket_id; + id.die = env->cpu[cpu.cpu].die_id; + perf_env__get_cache_id_for_cpu(cpu, env, cache_level, &id); + } + + return id; +} + static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data) { struct perf_env *env = data; @@ -1558,6 +1754,12 @@ static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *confi return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env); } +static struct aggr_cpu_id perf_stat__get_cache_file(struct perf_stat_config *config __maybe_unused, + struct perf_cpu cpu) +{ + return perf_env__get_cache_aggr_by_cpu(cpu, &perf_stat.session->header.env); +} + static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu cpu) { @@ -1589,6 +1791,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode) return perf_env__get_socket_aggr_by_cpu; case AGGR_DIE: return perf_env__get_die_aggr_by_cpu; + case AGGR_CACHE: + return perf_env__get_cache_aggr_by_cpu; case AGGR_CORE: return perf_env__get_core_aggr_by_cpu; case AGGR_NODE: @@ -1612,6 +1816,8 @@ static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode) return perf_stat__get_socket_file; case AGGR_DIE: return perf_stat__get_die_file; + case AGGR_CACHE: + return perf_stat__get_cache_file; case AGGR_CORE: return perf_stat__get_core_file; case AGGR_NODE: @@ -2127,7 +2333,8 @@ static struct perf_stat perf_stat = { .stat = perf_event__process_stat_event, .stat_round = process_stat_round_event, }, - .aggr_mode = AGGR_UNSET, + .aggr_mode = AGGR_UNSET, + .aggr_level = 0, }; static int __cmd_report(int argc, const char **argv) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 75d9c73e0184..a0719816a218 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -222,6 +222,10 @@ static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer) return a->socket - b->socket; else if (a->die != b->die) return a->die - b->die; + else if (a->cache_lvl != b->cache_lvl) + return a->cache_lvl - b->cache_lvl; + else if (a->cache != b->cache) + return a->cache - b->cache; else if (a->core != b->core) return a->core - b->core; else @@ -679,6 +683,8 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b a->node == b->node && a->socket == b->socket && a->die == b->die && + a->cache_lvl == b->cache_lvl && + a->cache == b->cache && a->core == b->core && a->cpu.cpu == b->cpu.cpu; } @@ -689,6 +695,8 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a) a->node == -1 && a->socket == -1 && a->die == -1 && + a->cache_lvl == -1 && + a->cache == -1 && a->core == -1 && a->cpu.cpu == -1; } @@ -700,6 +708,8 @@ struct aggr_cpu_id aggr_cpu_id__empty(void) .node = -1, .socket = -1, .die = -1, + .cache_lvl = -1, + .cache = -1, .core = -1, .cpu = (struct perf_cpu){ .cpu = -1 }, }; diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index e3426541e0aa..f394ccc0ccfb 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -20,6 +20,13 @@ struct aggr_cpu_id { int socket; /** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */ int die; + /** The cache level as read from /sys/devices/system/cpu/cpuX/cache/indexY/level */ + int cache_lvl; + /** + * The cache instance ID, which is the first CPU in the + * /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list + */ + int cache; /** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */ int core; /** CPU aggregation, note there is one CPU for each SMT thread. */ diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index bf5a6c14dfcd..319f456f0673 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -36,6 +36,7 @@ static int aggr_header_lens[] = { [AGGR_CORE] = 18, + [AGGR_CACHE] = 22, [AGGR_DIE] = 12, [AGGR_SOCKET] = 6, [AGGR_NODE] = 6, @@ -46,6 +47,7 @@ static int aggr_header_lens[] = { static const char *aggr_header_csv[] = { [AGGR_CORE] = "core,cpus,", + [AGGR_CACHE] = "cache,cpus,", [AGGR_DIE] = "die,cpus,", [AGGR_SOCKET] = "socket,cpus,", [AGGR_NONE] = "cpu,", @@ -56,6 +58,7 @@ static const char *aggr_header_csv[] = { static const char *aggr_header_std[] = { [AGGR_CORE] = "core", + [AGGR_CACHE] = "cache", [AGGR_DIE] = "die", [AGGR_SOCKET] = "socket", [AGGR_NONE] = "cpu", @@ -193,6 +196,10 @@ static void print_aggr_id_std(struct perf_stat_config *config, case AGGR_CORE: snprintf(buf, sizeof(buf), "S%d-D%d-C%d", id.socket, id.die, id.core); break; + case AGGR_CACHE: + snprintf(buf, sizeof(buf), "S%d-D%d-L%d-ID%d", + id.socket, id.die, id.cache_lvl, id.cache); + break; case AGGR_DIE: snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die); break; @@ -239,6 +246,10 @@ static void print_aggr_id_csv(struct perf_stat_config *config, fprintf(output, "S%d-D%d-C%d%s%d%s", id.socket, id.die, id.core, sep, aggr_nr, sep); break; + case AGGR_CACHE: + fprintf(config->output, "S%d-D%d-L%d-ID%d%s%d%s", + id.socket, id.die, id.cache_lvl, id.cache, sep, aggr_nr, sep); + break; case AGGR_DIE: fprintf(output, "S%d-D%d%s%d%s", id.socket, id.die, sep, aggr_nr, sep); @@ -284,6 +295,10 @@ static void print_aggr_id_json(struct perf_stat_config *config, fprintf(output, "\"core\" : \"S%d-D%d-C%d\", \"aggregate-number\" : %d, ", id.socket, id.die, id.core, aggr_nr); break; + case AGGR_CACHE: + fprintf(output, "\"cache\" : \"S%d-D%d-L%d-ID%d\", \"aggregate-number\" : %d, ", + id.socket, id.die, id.cache_lvl, id.cache, aggr_nr); + break; case AGGR_DIE: fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ", id.socket, id.die, aggr_nr); @@ -1125,6 +1140,7 @@ static void print_header_interval_std(struct perf_stat_config *config, case AGGR_NODE: case AGGR_SOCKET: case AGGR_DIE: + case AGGR_CACHE: case AGGR_CORE: fprintf(output, "#%*s %-*s cpus", INTERVAL_LEN - 1, "time", @@ -1425,6 +1441,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf switch (config->aggr_mode) { case AGGR_CORE: + case AGGR_CACHE: case AGGR_DIE: case AGGR_SOCKET: case AGGR_NODE: diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index e35e188237c8..7abff7cbb5a1 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -48,6 +48,7 @@ enum aggr_mode { AGGR_GLOBAL, AGGR_SOCKET, AGGR_DIE, + AGGR_CACHE, AGGR_CORE, AGGR_THREAD, AGGR_UNSET, @@ -64,6 +65,7 @@ typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config, str struct perf_stat_config { enum aggr_mode aggr_mode; + u32 aggr_level; bool scale; bool no_inherit; bool identifier; -- cgit v1.2.3 From 4b87406a3b590888edf02705a815eb62e122e9ba Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:43 +0530 Subject: perf stat record: Save cache level information When aggregating based on cache-topology, in addition to the aggregation mode, knowing the cache level at which data is aggregated is necessary to ensure consistency when running 'perf stat record' and later 'perf stat report'. Save the cache level for aggregation as a part of the env data that can be later retrieved when running perf stat report. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-4-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 3 ++- tools/perf/util/event.c | 7 ++++--- tools/perf/util/synthetic-events.c | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 51b9338f4c11..ba2dcf64f4e6 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -380,7 +380,8 @@ enum { PERF_STAT_CONFIG_TERM__AGGR_MODE = 0, PERF_STAT_CONFIG_TERM__INTERVAL = 1, PERF_STAT_CONFIG_TERM__SCALE = 2, - PERF_STAT_CONFIG_TERM__MAX = 3, + PERF_STAT_CONFIG_TERM__AGGR_LEVEL = 3, + PERF_STAT_CONFIG_TERM__MAX = 4, }; struct perf_record_stat_config_entry { diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 8ae742e32e3c..e8b0666d913c 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -135,9 +135,10 @@ void perf_event__read_stat_config(struct perf_stat_config *config, config->__val = event->data[i].val; \ break; - CASE(AGGR_MODE, aggr_mode) - CASE(SCALE, scale) - CASE(INTERVAL, interval) + CASE(AGGR_MODE, aggr_mode) + CASE(SCALE, scale) + CASE(INTERVAL, interval) + CASE(AGGR_LEVEL, aggr_level) #undef CASE default: pr_warning("unknown stat config term %" PRI_lu64 "\n", diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b2e4afa5efa1..45714a2785fd 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1375,6 +1375,7 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, ADD(AGGR_MODE, config->aggr_mode) ADD(INTERVAL, config->interval) ADD(SCALE, config->scale) + ADD(AGGR_LEVEL, config->aggr_level) WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX, "stat config terms unbalanced\n"); -- cgit v1.2.3 From aab667ca8837e45fda0204bed7b59abd634c0b2b Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:44 +0530 Subject: perf stat: Add "--per-cache" aggregation option and document it This patch adds support for "--per-cache" option for aggregation at a particular cache level and documents the same. Following is the output of 'perf stat' with aggregation at L3 for the event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running hackbench pinned to 4 LLCs: $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L3-ID0 16 9,500,803 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 6,338,099 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 355,005 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 22,067 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 16,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 11,619 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 4,238 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 31,158 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 28,242,452 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 22,906,973 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 72,898 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 56,907 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 20,456 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 40,913 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 78,113 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 37,897 ls_dmnd_fills_from_sys.ext_cache_remote Also support 'perf stat record' and 'perf stat report' with the ability to specify a different cache level to aggregate data at when running 'perf stat report'. $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L2-ID0 2 1,442,061 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID1 2 1,548,994 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID2 2 1,553,557 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID3 2 1,420,122 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID4 2 1,465,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID5 2 1,455,153 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID6 2 1,595,237 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID7 2 1,499,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID8 2 1,919,025 ls_dmnd_fills_from_sys.ext_cache_remote ... S1-D1-L2-ID127 2 21,295 ls_dmnd_fills_from_sys.ext_cache_remote $ sudo perf stat report --per-cache=L3 Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8': S0-D0-L3-ID0 16 11,979,906 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 14,257,202 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 377,484 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 27,224 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 26,816 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 14,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 10,499 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 53,817 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 27,361,987 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 37,299,024 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 84,125 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 64,561 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 13,403 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 20,138 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 93,220 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 35,465 ls_dmnd_fills_from_sys.ext_cache_remote On the above system, the domain covered by S0-D0-L3-ID0 contains S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is equal to the sum of counts for L2-ID0 to L2-ID7. Add documentation for the newly introduced "--per-cache" option. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 16 ++++++++++ tools/perf/builtin-stat.c | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 29bdcfa93f04..785f0e2bcfac 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide). The output includes the die number and the number of online processors on that die. This is useful to gauge the amount of aggregation. +--per-cache:: +Aggregate counts per cache instance for system-wide mode measurements. By +default, the aggregation happens for the cache level at the highest index +in the system. To specify a particular level, mention the cache level +alongside the option in the format [Ll][1-9][0-9]*. For example: +Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the +information at the boundary of the level 3 cache in the system. + --per-core:: Aggregate counts per physical processor for system-wide mode measurements. This is a useful mode to detect imbalance between physical cores. To enable this mode, @@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements. --per-die:: Aggregate counts per processor die for system-wide mode measurements. +--per-cache:: +Aggregate counts per cache instance for system-wide mode measurements. By +default, the aggregation happens for the cache level at the highest index +in the system. To specify a particular level, mention the cache level +alongside the option in the format [Ll][1-9][0-9]*. For example: Using +option "--per-cache=l3" or "--per-cache=L3" will aggregate the +information at the boundary of the level 3 cache in the system. + --per-core:: Aggregate counts per physical processor for system-wide mode measurements. diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0528d1bc15d2..176deeb8ee66 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt, return 0; } +static int parse_cache_level(const struct option *opt, + const char *str, + int unset __maybe_unused) +{ + int level; + u32 *aggr_mode = (u32 *)opt->value; + u32 *aggr_level = (u32 *)opt->data; + + /* + * If no string is specified, aggregate based on the topology of + * Last Level Cache (LLC). Since the LLC level can change from + * architecture to architecture, set level greater than + * MAX_CACHE_LVL which will be interpreted as LLC. + */ + if (str == NULL) { + level = MAX_CACHE_LVL + 1; + goto out; + } + + /* + * The format to specify cache level is LX or lX where X is the + * cache level. + */ + if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + level = atoi(&str[1]); + if (level < 1) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + if (level > MAX_CACHE_LVL) { + pr_err("perf only supports max cache level of %d.\n" + "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL); + return -EINVAL; + } +out: + *aggr_mode = AGGR_CACHE; + *aggr_level = level; + return 0; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1190,6 +1239,9 @@ static struct option stat_options[] = { "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level, + "cache level", "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, @@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv) "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level, + "cache level", + "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode, -- cgit v1.2.3 From bfce728db31790420758de3173c3e7185ba57cb1 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:45 +0530 Subject: pert tests: Add tests for new "perf stat --per-cache" aggregation option Add tests for the new "--per-cache" option in 'perf stat' for CSV and JSON generation as well as for the JSON linting. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-6-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_json_output_lint.py | 4 +++- tools/perf/tests/shell/stat+csv_output.sh | 14 ++++++++++++++ tools/perf/tests/shell/stat+json_output.sh | 13 +++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py index 61f3059ca54b..4acaaed5560d 100644 --- a/tools/perf/tests/shell/lib/perf_json_output_lint.py +++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py @@ -14,6 +14,7 @@ ap.add_argument('--system-wide', action='store_true') ap.add_argument('--event', action='store_true') ap.add_argument('--per-core', action='store_true') ap.add_argument('--per-thread', action='store_true') +ap.add_argument('--per-cache', action='store_true') ap.add_argument('--per-die', action='store_true') ap.add_argument('--per-node', action='store_true') ap.add_argument('--per-socket', action='store_true') @@ -47,6 +48,7 @@ def check_json_output(expected_items): 'counter-value': lambda x: is_counter_value(x), 'cgroup': lambda x: True, 'cpu': lambda x: isint(x), + 'cache': lambda x: True, 'die': lambda x: True, 'event': lambda x: True, 'event-runtime': lambda x: isfloat(x), @@ -83,7 +85,7 @@ try: expected_items = 7 elif args.interval or args.per_thread or args.system_wide_no_aggr: expected_items = 8 - elif args.per_core or args.per_socket or args.per_node or args.per_die: + elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache_instance: expected_items = 9 else: # If no option is specified, don't check the number of items. diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh index fb78b6251a4e..a1969f236a0a 100755 --- a/tools/perf/tests/shell/stat+csv_output.sh +++ b/tools/perf/tests/shell/stat+csv_output.sh @@ -40,6 +40,7 @@ function commachecker() ;; "--per-socket") exp=8 ;; "--per-node") exp=8 ;; "--per-die") exp=8 + ;; "--per-cache") exp=8 esac while read line @@ -145,6 +146,18 @@ check_per_thread() echo "[Success]" } +check_per_cache_instance() +{ + echo -n "Checking CSV output: per cache instance " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat -x$csv_sep --per-cache -a true 2>&1 | commachecker --per-cache + echo "[Success]" +} + check_per_die() { echo -n "Checking CSV output: per die " @@ -222,6 +235,7 @@ if [ $skip_test -ne 1 ] then check_system_wide_no_aggr check_per_core + check_per_cache_instance check_per_die check_per_socket else diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index f3e4967cc72e..c282afa6217c 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -120,6 +120,18 @@ check_per_thread() echo "[Success]" } +check_per_cache_instance() +{ + echo -n "Checking json output: per cache_instance " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoia and not root" + return + fi + perf stat -j --per-cache -a true 2>&1 | $PYTHON $pythonchecker --per-cache + echo "[Success]" +} + check_per_die() { echo -n "Checking json output: per die " @@ -197,6 +209,7 @@ if [ $skip_test -ne 1 ] then check_system_wide_no_aggr check_per_core + check_per_cache_instance check_per_die check_per_socket else -- cgit v1.2.3 From 66c6e0c1002771754c27ea5e14eeaa1405e3d088 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:38:03 -0700 Subject: perf jevents: Add support for metricgroup descriptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Metrics have a field where the groups they belong to are listed like the following from tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json: "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", The metric groups are shown in 'perf list' like the following where TopdownL1 is a metric group: TopdownL1: tma_backend_bound [This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend] tma_bad_speculation [This category represents fraction of slots wasted due to incorrect speculations] tma_frontend_bound [This category represents fraction of slots where the processor's Frontend undersupplies its Backend] tma_retiring [This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired] This patch adds support for a new json file in each model directory called metricgroups.json that comprises a dictionary containing entries that map from a metric group to a description: { ... "TopdownL1": "Metrics for top-down breakdown at level 1", ... } perf list is then updated to support this changing the above output to: TopdownL1: [Metrics for top-down breakdown at level 1] Committer notes: Added a (int) cast to the ARRAY_SIZE() introduced in this patch to address: /tmp/build/perf-tools-next/pmu-events/pmu-events.c: In function ‘describe_metricgroup’: /var/home/acme/git/perf-tools-next/tools/include/linux/kernel.h:102:25: error: overflow in conversion from ‘long unsigned int’ to ‘int’ changes value from ‘18446744073709551615’ to ‘-1’ [-Werror=overflow] 102 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) | ^ /tmp/build/perf-tools-next/pmu-events/pmu-events.c:61603:29: note: in expansion of macro ‘ARRAY_SIZE’ 61603 | int low = 0, high = ARRAY_SIZE(metricgroups) - 1; | ^~~~~~~~~~ cc1: all warnings being treated as errors Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 11 +++++-- tools/perf/pmu-events/empty-pmu-events.c | 5 ++++ tools/perf/pmu-events/jevents.py | 49 ++++++++++++++++++++++++++++++-- tools/perf/pmu-events/pmu-events.h | 2 ++ 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index c6bd0aa4a56e..e8520a027b45 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -192,9 +192,14 @@ static void default_print_metric(void *ps, if (group && print_state->metricgroups) { if (print_state->name_only) printf("%s ", group); - else if (print_state->metrics) - printf("\n%s:\n", group); - else + else if (print_state->metrics) { + const char *gdesc = describe_metricgroup(group); + + if (gdesc) + printf("\n%s: [%s]\n", group, gdesc); + else + printf("\n%s:\n", group); + } else printf("%s\n", group); } zfree(&print_state->last_metricgroups); diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c index e74defb5284f..a630c617e879 100644 --- a/tools/perf/pmu-events/empty-pmu-events.c +++ b/tools/perf/pmu-events/empty-pmu-events.c @@ -420,3 +420,8 @@ int pmu_for_each_sys_metric(pmu_metric_iter_fn fn __maybe_unused, void *data __m { return 0; } + +const char *describe_metricgroup(const char *group __maybe_unused) +{ + return NULL; +} diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 487ff01baf1b..7ed258be1829 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -37,6 +37,8 @@ _pending_metrics = [] _pending_metrics_tblname = None # Global BigCString shared by all structures. _bcs = None +# Map from the name of a metric group to a description of the group. +_metricgroups = {} # Order specific JsonEvent attributes will be visited. _json_event_attributes = [ # cmp_sevent related attributes. @@ -512,6 +514,17 @@ def preprocess_one_file(parents: Sequence[str], item: os.DirEntry) -> None: if not item.is_file() or not item.name.endswith('.json'): return + if item.name == 'metricgroups.json': + metricgroup_descriptions = json.load(open(item.path)) + for mgroup in metricgroup_descriptions: + assert len(mgroup) > 1, parents + description = f"{metricgroup_descriptions[mgroup]}\\000" + mgroup = f"{mgroup}\\000" + _bcs.add(mgroup) + _bcs.add(description) + _metricgroups[mgroup] = description + return + topic = get_topic(item.name) for event in read_json_events(item.path, topic): if event.name: @@ -548,7 +561,7 @@ def process_one_file(parents: Sequence[str], item: os.DirEntry) -> None: # Ignore other directories. If the file name does not have a .json # extension, ignore it. It could be a readme.txt for instance. - if not item.is_file() or not item.name.endswith('.json'): + if not item.is_file() or not item.name.endswith('.json') or item.name == 'metricgroups.json': return add_events_table_entries(item, get_topic(item.name)) @@ -911,6 +924,38 @@ int pmu_for_each_sys_metric(pmu_metric_iter_fn fn, void *data) } """) +def print_metricgroups() -> None: + _args.output_file.write(""" +static const int metricgroups[][2] = { +""") + for mgroup in sorted(_metricgroups): + description = _metricgroups[mgroup] + _args.output_file.write( + f'\t{{ {_bcs.offsets[mgroup]}, {_bcs.offsets[description]} }}, /* {mgroup} => {description} */\n' + ) + _args.output_file.write(""" +}; + +const char *describe_metricgroup(const char *group) +{ + int low = 0, high = (int)ARRAY_SIZE(metricgroups) - 1; + + while (low <= high) { + int mid = (low + high) / 2; + const char *mgroup = &big_c_string[metricgroups[mid][0]]; + int cmp = strcmp(mgroup, group); + + if (cmp == 0) { + return &big_c_string[metricgroups[mid][1]]; + } else if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + } + } + return NULL; +} +""") def main() -> None: global _args @@ -993,7 +1038,7 @@ struct compact_pmu_event { print_mapping_table(archs) print_system_mapping_table() - + print_metricgroups() if __name__ == '__main__': main() diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index 3549e6971a4d..8cd23d656a5d 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -93,4 +93,6 @@ const struct pmu_metrics_table *find_sys_metrics_table(const char *name); int pmu_for_each_sys_event(pmu_event_iter_fn fn, void *data); int pmu_for_each_sys_metric(pmu_metric_iter_fn fn, void *data); +const char *describe_metricgroup(const char *group); + #endif -- cgit v1.2.3 From 6ac2230b55d392e6294ea9f406619ed39fd9050f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:38:04 -0700 Subject: perf vendor events intel: Add metricgroup descriptions for all models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add metric group descriptions created by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py The descriptions add some additional detail in perf list. Committer notes: Removed unrelated changes to tools/perf/pmu-events/arch/x86/mapfile.csv that removed AMD mappings and ended up breaking the build with things like: CC /tmp/build/perf-tools-next/pmu-events/pmu-events.o /tmp/build/perf-tools-next/pmu-events/pmu-events.c:23808:39: error: ‘pmu_metrics__amdzen4’ defined but not used [-Werror=unused-const-variable=] 23808 | static const struct compact_pmu_event pmu_metrics__amdzen4[] = { | ^~~~~~~~~~~~~~~~~~~~ /tmp/build/perf-tools-next/pmu-events/pmu-events.c:23316:39: error: ‘pmu_events__amdzen4’ defined but not used [-Werror=unused-const-variable=] 23316 | static const struct compact_pmu_event pmu_events__amdzen4[] = { | ^~~~~~~~~~~~~~~~~~~ Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/alderlake/metricgroups.json | 122 +++++++++++++++++++++ .../arch/x86/alderlaken/metricgroups.json | 26 +++++ .../arch/x86/broadwell/metricgroups.json | 107 ++++++++++++++++++ .../arch/x86/broadwellde/metricgroups.json | 107 ++++++++++++++++++ .../arch/x86/broadwellx/metricgroups.json | 107 ++++++++++++++++++ .../arch/x86/cascadelakex/metricgroups.json | 114 +++++++++++++++++++ .../pmu-events/arch/x86/haswell/metricgroups.json | 107 ++++++++++++++++++ .../pmu-events/arch/x86/haswellx/metricgroups.json | 107 ++++++++++++++++++ .../pmu-events/arch/x86/icelake/metricgroups.json | 113 +++++++++++++++++++ .../pmu-events/arch/x86/icelakex/metricgroups.json | 114 +++++++++++++++++++ .../arch/x86/ivybridge/metricgroups.json | 107 ++++++++++++++++++ .../pmu-events/arch/x86/ivytown/metricgroups.json | 107 ++++++++++++++++++ .../pmu-events/arch/x86/jaketown/metricgroups.json | 100 +++++++++++++++++ .../arch/x86/sandybridge/metricgroups.json | 100 +++++++++++++++++ .../arch/x86/sapphirerapids/metricgroups.json | 118 ++++++++++++++++++++ .../pmu-events/arch/x86/skylake/metricgroups.json | 113 +++++++++++++++++++ .../pmu-events/arch/x86/skylakex/metricgroups.json | 114 +++++++++++++++++++ .../arch/x86/tigerlake/metricgroups.json | 113 +++++++++++++++++++ 18 files changed, 1896 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/alderlaken/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/haswell/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/icelake/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/skylake/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json diff --git a/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json new file mode 100644 index 000000000000..516eb0f93f02 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json @@ -0,0 +1,122 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IntVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", + "tma_backend_bound_aux_group": "Metrics contributing to tma_backend_bound_aux category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_base_group": "Metrics contributing to tma_base category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_int_operations_group": "Metrics contributing to tma_int_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_mem_scheduler_group": "Metrics contributing to tma_mem_scheduler category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_nuke_group": "Metrics contributing to tma_nuke category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_resource_bound_group": "Metrics contributing to tma_resource_bound category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/metricgroups.json b/tools/perf/pmu-events/arch/x86/alderlaken/metricgroups.json new file mode 100644 index 000000000000..7b2049cd2694 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/alderlaken/metricgroups.json @@ -0,0 +1,26 @@ +{ + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_backend_bound_aux_group": "Metrics contributing to tma_backend_bound_aux category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_base_group": "Metrics contributing to tma_base category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", + "tma_mem_scheduler_group": "Metrics contributing to tma_mem_scheduler category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_nuke_group": "Metrics contributing to tma_nuke category", + "tma_resource_bound_group": "Metrics contributing to tma_resource_bound category", + "tma_retiring_group": "Metrics contributing to tma_retiring category" +} diff --git a/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json new file mode 100644 index 000000000000..bc6a9a4d27a9 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json @@ -0,0 +1,114 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IoBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json new file mode 100644 index 000000000000..a151ba9cccb0 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json @@ -0,0 +1,113 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json new file mode 100644 index 000000000000..bc6a9a4d27a9 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json @@ -0,0 +1,114 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IoBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json new file mode 100644 index 000000000000..f6a0258e3241 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json @@ -0,0 +1,107 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json new file mode 100644 index 000000000000..bebb85945d62 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json @@ -0,0 +1,100 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json new file mode 100644 index 000000000000..bebb85945d62 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json @@ -0,0 +1,100 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json new file mode 100644 index 000000000000..e6f7934320bf --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json @@ -0,0 +1,118 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IntVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IoBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_int_operations_group": "Metrics contributing to tma_int_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_bandwidth_group": "Metrics contributing to tma_mem_bandwidth category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json new file mode 100644 index 000000000000..a151ba9cccb0 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json @@ -0,0 +1,113 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json new file mode 100644 index 000000000000..bc6a9a4d27a9 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json @@ -0,0 +1,114 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IoBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json new file mode 100644 index 000000000000..a151ba9cccb0 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json @@ -0,0 +1,113 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBC": "Metrics related by the issue $issueBC", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} -- cgit v1.2.3 From 237d41d4a2d7d45e41f5450636d1cf689cba0e8a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 14:53:36 -0700 Subject: perf cpumap: Add intersect function The merge function gives the union of two cpu maps. Add an intersect function which is necessary, for example, when intersecting a PMUs supported CPUs with user requested. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230526215410.2435674-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 35 ++++++++++++++++++++++++++++++ tools/lib/perf/include/perf/cpumap.h | 2 ++ tools/perf/tests/builtin-test.c | 1 + tools/perf/tests/cpumap.c | 41 ++++++++++++++++++++++++++++++++++++ tools/perf/tests/tests.h | 1 + 5 files changed, 80 insertions(+) diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 1229b18bcdb1..d4f3a1a12522 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -402,3 +402,38 @@ struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, perf_cpu_map__put(orig); return merged; } + +struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig, + struct perf_cpu_map *other) +{ + struct perf_cpu *tmp_cpus; + int tmp_len; + int i, j, k; + struct perf_cpu_map *merged = NULL; + + if (perf_cpu_map__is_subset(other, orig)) + return perf_cpu_map__get(orig); + if (perf_cpu_map__is_subset(orig, other)) + return perf_cpu_map__get(other); + + tmp_len = max(orig->nr, other->nr); + tmp_cpus = malloc(tmp_len * sizeof(struct perf_cpu)); + if (!tmp_cpus) + return NULL; + + i = j = k = 0; + while (i < orig->nr && j < other->nr) { + if (orig->map[i].cpu < other->map[j].cpu) + i++; + else if (orig->map[i].cpu > other->map[j].cpu) + j++; + else { + j++; + tmp_cpus[k++] = orig->map[i++]; + } + } + if (k) + merged = cpu_map__trim_new(k, tmp_cpus); + free(tmp_cpus); + return merged; +} diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 8724dde79342..b4c9a827a88a 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -25,6 +25,8 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__read(FILE *file); LIBPERF_API struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map); LIBPERF_API struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, struct perf_cpu_map *other); +LIBPERF_API struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig, + struct perf_cpu_map *other); LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index b89d69afcef0..eef400025fca 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -97,6 +97,7 @@ static struct test_suite *generic_tests[] = { &suite__backward_ring_buffer, &suite__cpu_map_print, &suite__cpu_map_merge, + &suite__cpu_map_intersect, &suite__sdt_event, &suite__is_printable_array, &suite__bitmap_print, diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index b1a924314e09..92232978fe5e 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -171,6 +171,47 @@ static int test__cpu_map_merge(struct test_suite *test __maybe_unused, int subte return 0; } +static int __test__cpu_map_intersect(const char *lhs, const char *rhs, int nr, const char *expected) +{ + struct perf_cpu_map *a = perf_cpu_map__new(lhs); + struct perf_cpu_map *b = perf_cpu_map__new(rhs); + struct perf_cpu_map *c = perf_cpu_map__intersect(a, b); + char buf[100]; + + TEST_ASSERT_EQUAL("failed to intersect map: bad nr", perf_cpu_map__nr(c), nr); + cpu_map__snprint(c, buf, sizeof(buf)); + TEST_ASSERT_VAL("failed to intersect map: bad result", !strcmp(buf, expected)); + perf_cpu_map__put(a); + perf_cpu_map__put(b); + perf_cpu_map__put(c); + return 0; +} + +static int test__cpu_map_intersect(struct test_suite *test __maybe_unused, + int subtest __maybe_unused) +{ + int ret; + + ret = __test__cpu_map_intersect("4,2,1", "4,5,7", 1, "4"); + if (ret) + return ret; + ret = __test__cpu_map_intersect("1-8", "6-9", 3, "6-8"); + if (ret) + return ret; + ret = __test__cpu_map_intersect("1-8,12-20", "6-9,15", 4, "6-8,15"); + if (ret) + return ret; + ret = __test__cpu_map_intersect("4,2,1", "1", 1, "1"); + if (ret) + return ret; + ret = __test__cpu_map_intersect("1", "4,2,1", 1, "1"); + if (ret) + return ret; + ret = __test__cpu_map_intersect("1", "1", 1, "1"); + return ret; +} + DEFINE_SUITE("Synthesize cpu map", cpu_map_synthesize); DEFINE_SUITE("Print cpu map", cpu_map_print); DEFINE_SUITE("Merge cpu map", cpu_map_merge); +DEFINE_SUITE("Intersect cpu map", cpu_map_intersect); diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 9a0f3904e53d..b4e54f08bc39 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -127,6 +127,7 @@ DECLARE_SUITE(event_times); DECLARE_SUITE(backward_ring_buffer); DECLARE_SUITE(cpu_map_print); DECLARE_SUITE(cpu_map_merge); +DECLARE_SUITE(cpu_map_intersect); DECLARE_SUITE(sdt_event); DECLARE_SUITE(is_printable_array); DECLARE_SUITE(bitmap_print); -- cgit v1.2.3 From 5cebb33fd929dc67812072741408dfcd1a7db4a7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 14:53:37 -0700 Subject: perf tests: Organize cpu_map tests into a single suite Go from 4 suites to a single suite with 4 test cases. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230526215410.2435674-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 5 +---- tools/perf/tests/cpumap.c | 16 ++++++++++++---- tools/perf/tests/tests.h | 5 +---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index eef400025fca..aa44fdc84763 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -88,16 +88,13 @@ static struct test_suite *generic_tests[] = { &suite__bpf, &suite__thread_map_synthesize, &suite__thread_map_remove, - &suite__cpu_map_synthesize, + &suite__cpu_map, &suite__synthesize_stat_config, &suite__synthesize_stat, &suite__synthesize_stat_round, &suite__event_update, &suite__event_times, &suite__backward_ring_buffer, - &suite__cpu_map_print, - &suite__cpu_map_merge, - &suite__cpu_map_intersect, &suite__sdt_event, &suite__is_printable_array, &suite__bitmap_print, diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 92232978fe5e..83805690c209 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -211,7 +211,15 @@ static int test__cpu_map_intersect(struct test_suite *test __maybe_unused, return ret; } -DEFINE_SUITE("Synthesize cpu map", cpu_map_synthesize); -DEFINE_SUITE("Print cpu map", cpu_map_print); -DEFINE_SUITE("Merge cpu map", cpu_map_merge); -DEFINE_SUITE("Intersect cpu map", cpu_map_intersect); +static struct test_case tests__cpu_map[] = { + TEST_CASE("Synthesize cpu map", cpu_map_synthesize), + TEST_CASE("Print cpu map", cpu_map_print), + TEST_CASE("Merge cpu map", cpu_map_merge), + TEST_CASE("Intersect cpu map", cpu_map_intersect), + { .name = NULL, } +}; + +struct test_suite suite__cpu_map = { + .desc = "CPU map", + .test_cases = tests__cpu_map, +}; diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index b4e54f08bc39..f424c0b7f43f 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -118,16 +118,13 @@ DECLARE_SUITE(bpf); DECLARE_SUITE(session_topology); DECLARE_SUITE(thread_map_synthesize); DECLARE_SUITE(thread_map_remove); -DECLARE_SUITE(cpu_map_synthesize); +DECLARE_SUITE(cpu_map); DECLARE_SUITE(synthesize_stat_config); DECLARE_SUITE(synthesize_stat); DECLARE_SUITE(synthesize_stat_round); DECLARE_SUITE(event_update); DECLARE_SUITE(event_times); DECLARE_SUITE(backward_ring_buffer); -DECLARE_SUITE(cpu_map_print); -DECLARE_SUITE(cpu_map_merge); -DECLARE_SUITE(cpu_map_intersect); DECLARE_SUITE(sdt_event); DECLARE_SUITE(is_printable_array); DECLARE_SUITE(bitmap_print); -- cgit v1.2.3 From 540c910c65a94fb4622b9dd03d71adc0e82d94e9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2023 14:06:00 -0700 Subject: perf test: Fix perf stat JSON output test The recent --per-cache option test caused a problem. According to the option name, I think it should check args.per_cache instead of args.per_cache_instance. $ sudo ./perf test -v 99 99: perf stat JSON output linter : --- start --- test child forked, pid 3086101 Checking json output: no args [Success] Checking json output: system wide [Success] Checking json output: interval [Success] Checking json output: event [Success] Checking json output: per thread [Success] Checking json output: per node [Success] Checking json output: system wide no aggregation [Success] Checking json output: per core [Success] Checking json output: per cache_instance Test failed for input: ... Traceback (most recent call last): File "linux/tools/perf/tests/shell/lib/perf_json_output_lint.py", line 88, in elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache_instance: AttributeError: 'Namespace' object has no attribute 'per_cache_instance' test child finished with -1 ---- end ---- perf stat JSON output linter: FAILED! Fixes: bfce728db3179042 ("pert tests: Add tests for new "perf stat --per-cache" aggregation option") Signed-off-by: Namhyung Kim Tested-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230524210600.3095830-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_json_output_lint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py index 4acaaed5560d..b81582a89d36 100644 --- a/tools/perf/tests/shell/lib/perf_json_output_lint.py +++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py @@ -85,7 +85,7 @@ try: expected_items = 7 elif args.interval or args.per_thread or args.system_wide_no_aggr: expected_items = 8 - elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache_instance: + elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache: expected_items = 9 else: # If no option is specified, don't check the number of items. -- cgit v1.2.3 From caa90a7bd3bef1814e680da9e2538c1a813aa8a9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 22:55:17 -0700 Subject: perf test python: Put perf python at start of sys.path This avoids picking up a system installed version of the perf python module. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230527055517.2711487-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/python-use.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c index 6b990ee38575..0ebc22ac8d5b 100644 --- a/tools/perf/tests/python-use.c +++ b/tools/perf/tests/python-use.c @@ -14,7 +14,7 @@ static int test__python_use(struct test_suite *test __maybe_unused, int subtest char *cmd; int ret; - if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s", + if (asprintf(&cmd, "echo \"import sys ; sys.path.insert(0, '%s'); import perf\" | %s %s", PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0) return -1; -- cgit v1.2.3 From 7d1b529f164d33ad4514b272bcec65036873d717 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:37 -0700 Subject: perf cpumap: Add internal nr and cpu accessors These accessors assume the map is non-null. Rewrite functions to use rather than direct accesses. This also fixes a build regression for REFCNT_CHECKING in the intersect function. Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kan Liang Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 74 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index d4f3a1a12522..ec3f4ac8b1e2 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -99,6 +99,11 @@ static int cmp_cpu(const void *a, const void *b) return cpu_a->cpu - cpu_b->cpu; } +static struct perf_cpu __perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx) +{ + return RC_CHK_ACCESS(cpus)->map[idx]; +} + static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, const struct perf_cpu *tmp_cpus) { size_t payload_size = nr_cpus * sizeof(struct perf_cpu); @@ -111,8 +116,12 @@ static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, const struct perf_cpu /* Remove dups */ j = 0; for (i = 0; i < nr_cpus; i++) { - if (i == 0 || RC_CHK_ACCESS(cpus)->map[i].cpu != RC_CHK_ACCESS(cpus)->map[i - 1].cpu) - RC_CHK_ACCESS(cpus)->map[j++].cpu = RC_CHK_ACCESS(cpus)->map[i].cpu; + if (i == 0 || + __perf_cpu_map__cpu(cpus, i).cpu != + __perf_cpu_map__cpu(cpus, i - 1).cpu) { + RC_CHK_ACCESS(cpus)->map[j++].cpu = + __perf_cpu_map__cpu(cpus, i).cpu; + } } perf_cpu_map__set_nr(cpus, j); assert(j <= nr_cpus); @@ -269,26 +278,31 @@ out: return cpus; } +static int __perf_cpu_map__nr(const struct perf_cpu_map *cpus) +{ + return RC_CHK_ACCESS(cpus)->nr; +} + struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx) { struct perf_cpu result = { .cpu = -1 }; - if (cpus && idx < RC_CHK_ACCESS(cpus)->nr) - return RC_CHK_ACCESS(cpus)->map[idx]; + if (cpus && idx < __perf_cpu_map__nr(cpus)) + return __perf_cpu_map__cpu(cpus, idx); return result; } int perf_cpu_map__nr(const struct perf_cpu_map *cpus) { - return cpus ? RC_CHK_ACCESS(cpus)->nr : 1; + return cpus ? __perf_cpu_map__nr(cpus) : 1; } bool perf_cpu_map__empty(const struct perf_cpu_map *map) { - return map ? RC_CHK_ACCESS(map)->map[0].cpu == -1 : true; + return map ? __perf_cpu_map__cpu(map, 0).cpu == -1 : true; } int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu) @@ -299,10 +313,10 @@ int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu) return -1; low = 0; - high = RC_CHK_ACCESS(cpus)->nr; + high = __perf_cpu_map__nr(cpus); while (low < high) { int idx = (low + high) / 2; - struct perf_cpu cpu_at_idx = RC_CHK_ACCESS(cpus)->map[idx]; + struct perf_cpu cpu_at_idx = __perf_cpu_map__cpu(cpus, idx); if (cpu_at_idx.cpu == cpu.cpu) return idx; @@ -328,7 +342,9 @@ struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) }; // cpu_map__trim_new() qsort()s it, cpu_map__default_new() sorts it as well. - return RC_CHK_ACCESS(map)->nr > 0 ? RC_CHK_ACCESS(map)->map[RC_CHK_ACCESS(map)->nr - 1] : result; + return __perf_cpu_map__nr(map) > 0 + ? __perf_cpu_map__cpu(map, __perf_cpu_map__nr(map) - 1) + : result; } /** Is 'b' a subset of 'a'. */ @@ -336,15 +352,15 @@ bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu { if (a == b || !b) return true; - if (!a || RC_CHK_ACCESS(b)->nr > RC_CHK_ACCESS(a)->nr) + if (!a || __perf_cpu_map__nr(b) > __perf_cpu_map__nr(a)) return false; - for (int i = 0, j = 0; i < RC_CHK_ACCESS(a)->nr; i++) { - if (RC_CHK_ACCESS(a)->map[i].cpu > RC_CHK_ACCESS(b)->map[j].cpu) + for (int i = 0, j = 0; i < __perf_cpu_map__nr(a); i++) { + if (__perf_cpu_map__cpu(a, i).cpu > __perf_cpu_map__cpu(b, j).cpu) return false; - if (RC_CHK_ACCESS(a)->map[i].cpu == RC_CHK_ACCESS(b)->map[j].cpu) { + if (__perf_cpu_map__cpu(a, i).cpu == __perf_cpu_map__cpu(b, j).cpu) { j++; - if (j == RC_CHK_ACCESS(b)->nr) + if (j == __perf_cpu_map__nr(b)) return true; } } @@ -374,27 +390,27 @@ struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, return perf_cpu_map__get(other); } - tmp_len = RC_CHK_ACCESS(orig)->nr + RC_CHK_ACCESS(other)->nr; + tmp_len = __perf_cpu_map__nr(orig) + __perf_cpu_map__nr(other); tmp_cpus = malloc(tmp_len * sizeof(struct perf_cpu)); if (!tmp_cpus) return NULL; /* Standard merge algorithm from wikipedia */ i = j = k = 0; - while (i < RC_CHK_ACCESS(orig)->nr && j < RC_CHK_ACCESS(other)->nr) { - if (RC_CHK_ACCESS(orig)->map[i].cpu <= RC_CHK_ACCESS(other)->map[j].cpu) { - if (RC_CHK_ACCESS(orig)->map[i].cpu == RC_CHK_ACCESS(other)->map[j].cpu) + while (i < __perf_cpu_map__nr(orig) && j < __perf_cpu_map__nr(other)) { + if (__perf_cpu_map__cpu(orig, i).cpu <= __perf_cpu_map__cpu(other, j).cpu) { + if (__perf_cpu_map__cpu(orig, i).cpu == __perf_cpu_map__cpu(other, j).cpu) j++; - tmp_cpus[k++] = RC_CHK_ACCESS(orig)->map[i++]; + tmp_cpus[k++] = __perf_cpu_map__cpu(orig, i++); } else - tmp_cpus[k++] = RC_CHK_ACCESS(other)->map[j++]; + tmp_cpus[k++] = __perf_cpu_map__cpu(other, j++); } - while (i < RC_CHK_ACCESS(orig)->nr) - tmp_cpus[k++] = RC_CHK_ACCESS(orig)->map[i++]; + while (i < __perf_cpu_map__nr(orig)) + tmp_cpus[k++] = __perf_cpu_map__cpu(orig, i++); - while (j < RC_CHK_ACCESS(other)->nr) - tmp_cpus[k++] = RC_CHK_ACCESS(other)->map[j++]; + while (j < __perf_cpu_map__nr(other)) + tmp_cpus[k++] = __perf_cpu_map__cpu(other, j++); assert(k <= tmp_len); merged = cpu_map__trim_new(k, tmp_cpus); @@ -416,20 +432,20 @@ struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig, if (perf_cpu_map__is_subset(orig, other)) return perf_cpu_map__get(other); - tmp_len = max(orig->nr, other->nr); + tmp_len = max(__perf_cpu_map__nr(orig), __perf_cpu_map__nr(other)); tmp_cpus = malloc(tmp_len * sizeof(struct perf_cpu)); if (!tmp_cpus) return NULL; i = j = k = 0; - while (i < orig->nr && j < other->nr) { - if (orig->map[i].cpu < other->map[j].cpu) + while (i < __perf_cpu_map__nr(orig) && j < __perf_cpu_map__nr(other)) { + if (__perf_cpu_map__cpu(orig, i).cpu < __perf_cpu_map__cpu(other, j).cpu) i++; - else if (orig->map[i].cpu > other->map[j].cpu) + else if (__perf_cpu_map__cpu(orig, i).cpu > __perf_cpu_map__cpu(other, j).cpu) j++; else { j++; - tmp_cpus[k++] = orig->map[i++]; + tmp_cpus[k++] = __perf_cpu_map__cpu(orig, i++); } } if (k) -- cgit v1.2.3 From 74c075cab1e793fe8418f15eb6e6c88d2197ce1d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:38 -0700 Subject: perf cpumap: Add equal function Equality is a useful property to compare after merging and intersecting maps. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 21 ++++++++++++++++++++ tools/lib/perf/include/perf/cpumap.h | 2 ++ tools/perf/tests/cpumap.c | 37 ++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index ec3f4ac8b1e2..98d7cb24a158 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -335,6 +335,27 @@ bool perf_cpu_map__has(const struct perf_cpu_map *cpus, struct perf_cpu cpu) return perf_cpu_map__idx(cpus, cpu) != -1; } +bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_map *rhs) +{ + int nr; + + if (lhs == rhs) + return true; + + if (!lhs || !rhs) + return false; + + nr = __perf_cpu_map__nr(lhs); + if (nr != __perf_cpu_map__nr(rhs)) + return false; + + for (int idx = 0; idx < nr; idx++) { + if (__perf_cpu_map__cpu(lhs, idx).cpu != __perf_cpu_map__cpu(rhs, idx).cpu) + return false; + } + return true; +} + struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) { struct perf_cpu result = { diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index b4c9a827a88a..cedfc26d944e 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -33,6 +33,8 @@ LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); +LIBPERF_API bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, + const struct perf_cpu_map *rhs); #define perf_cpu_map__for_each_cpu(cpu, idx, cpus) \ for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx); \ diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 83805690c209..7730fc2ab40b 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -211,11 +211,48 @@ static int test__cpu_map_intersect(struct test_suite *test __maybe_unused, return ret; } +static int test__cpu_map_equal(struct test_suite *test __maybe_unused, int subtest __maybe_unused) +{ + struct perf_cpu_map *any = perf_cpu_map__dummy_new(); + struct perf_cpu_map *one = perf_cpu_map__new("1"); + struct perf_cpu_map *two = perf_cpu_map__new("2"); + struct perf_cpu_map *empty = perf_cpu_map__intersect(one, two); + struct perf_cpu_map *pair = perf_cpu_map__new("1-2"); + struct perf_cpu_map *tmp; + struct perf_cpu_map *maps[] = {empty, any, one, two, pair}; + + for (size_t i = 0; i < ARRAY_SIZE(maps); i++) { + /* Maps equal themself. */ + TEST_ASSERT_VAL("equal", perf_cpu_map__equal(maps[i], maps[i])); + for (size_t j = 0; j < ARRAY_SIZE(maps); j++) { + /* Maps dont't equal each other. */ + if (i == j) + continue; + TEST_ASSERT_VAL("not equal", !perf_cpu_map__equal(maps[i], maps[j])); + } + } + + /* Maps equal made maps. */ + tmp = perf_cpu_map__merge(perf_cpu_map__get(one), two); + TEST_ASSERT_VAL("pair", perf_cpu_map__equal(pair, tmp)); + perf_cpu_map__put(tmp); + + tmp = perf_cpu_map__intersect(pair, one); + TEST_ASSERT_VAL("one", perf_cpu_map__equal(one, tmp)); + perf_cpu_map__put(tmp); + + for (size_t i = 0; i < ARRAY_SIZE(maps); i++) + perf_cpu_map__put(maps[i]); + + return TEST_OK; +} + static struct test_case tests__cpu_map[] = { TEST_CASE("Synthesize cpu map", cpu_map_synthesize), TEST_CASE("Print cpu map", cpu_map_print), TEST_CASE("Merge cpu map", cpu_map_merge), TEST_CASE("Intersect cpu map", cpu_map_intersect), + TEST_CASE("Equal cpu map", cpu_map_equal), { .name = NULL, } }; -- cgit v1.2.3 From 916ce34ac9f542a828293da171a442a497d1f9d2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:39 -0700 Subject: libperf cpumap: Add "any CPU"/dummy test function It is common in the code currently to test a map for "empty" when in fact the "any CPU"/dummy value of -1 is being sought. Add a new function to enable this and document the behavior of two other functions. The term "any CPU" comes from perf_event_open, where the value is consumed, but it is more typical in the code to see this value/map referred to as the dummy value. This could be misleading due to the dummy event and also dummy not being intention revealing, so it is hoped to migrate the code to referring to this as "any CPU". Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 5 +++++ tools/lib/perf/include/perf/cpumap.h | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 98d7cb24a158..2a5a29217374 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -356,6 +356,11 @@ bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_m return true; } +bool perf_cpu_map__has_any_cpu(const struct perf_cpu_map *map) +{ + return map && __perf_cpu_map__cpu(map, 0).cpu == -1; +} + struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map) { struct perf_cpu result = { diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index cedfc26d944e..e38d859a384d 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -18,6 +18,9 @@ struct perf_cache { struct perf_cpu_map; +/** + * perf_cpu_map__dummy_new - a map with a singular "any CPU"/dummy -1 value. + */ LIBPERF_API struct perf_cpu_map *perf_cpu_map__dummy_new(void); LIBPERF_API struct perf_cpu_map *perf_cpu_map__default_new(void); LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list); @@ -30,11 +33,18 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *or LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); +/** + * perf_cpu_map__empty - is map either empty or the "any CPU"/dummy value. + */ LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); LIBPERF_API bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_map *rhs); +/** + * perf_cpu_map__any_cpu - Does the map contain the "any CPU"/dummy -1 value? + */ +LIBPERF_API bool perf_cpu_map__has_any_cpu(const struct perf_cpu_map *map); #define perf_cpu_map__for_each_cpu(cpu, idx, cpus) \ for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx); \ -- cgit v1.2.3 From 4bf7e81aadfdfb6f2e5197c5d3cf50ab9ddfb286 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:40 -0700 Subject: perf pmu: Detect ARM and hybrid PMUs with sysfs is_arm_pmu_core detects a core PMU via the presence of a "cpus" file rather than a "cpumask" file. This pattern holds for hybrid PMUs so rename the function and remove redundant perf_pmu__is_hybrid tests. Add a new helper is_pmu_hybrid similar to is_pmu_core. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 29 ++++++++++++++++++----------- tools/perf/util/pmu.h | 1 + 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index f4f0afbc391c..7392cec725bf 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -643,12 +643,14 @@ static char *pmu_id(const char *name) return str; } -/* - * PMU CORE devices have different name other than cpu in sysfs on some - * platforms. - * Looking for possible sysfs files to identify the arm core device. +/** + * is_sysfs_pmu_core() - PMU CORE devices have different name other than cpu in + * sysfs on some platforms like ARM or Intel hybrid. Looking for + * possible the cpus file in sysfs files to identify whether this is a + * core device. + * @name: The PMU name such as "cpu_atom". */ -static int is_arm_pmu_core(const char *name) +static int is_sysfs_pmu_core(const char *name) { char path[PATH_MAX]; @@ -814,7 +816,7 @@ void pmu_add_cpu_aliases_table(struct list_head *head, struct perf_pmu *pmu, struct pmu_add_cpu_aliases_map_data data = { .head = head, .name = pmu->name, - .cpu_name = is_arm_pmu_core(pmu->name) ? pmu->name : "cpu", + .cpu_name = is_sysfs_pmu_core(pmu->name) ? pmu->name : "cpu", .pmu = pmu, }; @@ -1647,22 +1649,27 @@ static int cmp_sevent(const void *a, const void *b) bool is_pmu_core(const char *name) { - return !strcmp(name, "cpu") || is_arm_pmu_core(name); + return !strcmp(name, "cpu") || is_sysfs_pmu_core(name); +} + +bool is_pmu_hybrid(const char *name) +{ + return !strcmp(name, "cpu_atom") || !strcmp(name, "cpu_core"); } bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) { - return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); + return is_pmu_core(pmu->name); } bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu) { - return is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); + return is_pmu_core(pmu->name); } bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) { - return !perf_pmu__is_hybrid(pmu->name); + return !is_pmu_hybrid(pmu->name); } static bool pmu_alias_is_duplicate(struct sevent *alias_a, @@ -1716,7 +1723,7 @@ void print_pmu_events(const struct print_callbacks *print_cb, void *print_state) pmu = NULL; j = 0; while ((pmu = perf_pmu__scan(pmu)) != NULL) { - bool is_cpu = is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name); + bool is_cpu = is_pmu_core(pmu->name); list_for_each_entry(event, &pmu->aliases, list) { aliases[j].event = event; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 0e0cb6283594..f50919f1b34c 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -220,6 +220,7 @@ void perf_pmu__del_formats(struct list_head *formats); struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu); bool is_pmu_core(const char *name); +bool is_pmu_hybrid(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); -- cgit v1.2.3 From e20d1f2fa29707d1fad7a667737257b9494043fd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:41 -0700 Subject: perf pmu: Add is_core to pmu Cache is_pmu_core in the pmu to avoid recomputation. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 7 ++++--- tools/perf/util/pmu.h | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7392cec725bf..e8c0762c311a 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -952,6 +952,7 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) } pmu->type = type; + pmu->is_core = is_pmu_core(name); pmu->is_uncore = pmu_is_uncore(dirfd, name); if (pmu->is_uncore) pmu->id = pmu_id(name); @@ -1659,12 +1660,12 @@ bool is_pmu_hybrid(const char *name) bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) { - return is_pmu_core(pmu->name); + return pmu->is_core; } bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu) { - return is_pmu_core(pmu->name); + return pmu->is_core; } bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) @@ -1723,7 +1724,7 @@ void print_pmu_events(const struct print_callbacks *print_cb, void *print_state) pmu = NULL; j = 0; while ((pmu = perf_pmu__scan(pmu)) != NULL) { - bool is_cpu = is_pmu_core(pmu->name); + bool is_cpu = pmu->is_core; list_for_each_entry(event, &pmu->aliases, list) { aliases[j].event = event; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index f50919f1b34c..96236a79c6fd 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -59,6 +59,13 @@ struct perf_pmu { * @selectable: Can the PMU name be selected as if it were an event? */ bool selectable; + /** + * @is_core: Is the PMU the core CPU PMU? Determined by the name being + * "cpu" or by the presence of + * /bus/event_source/devices//cpus. There may be >1 core + * PMU on systems like Intel hybrid. + */ + bool is_core; /** * @is_uncore: Is the PMU not within the CPU core? Determined by the * presence of /bus/event_source/devices//cpumask. -- cgit v1.2.3 From 1578e63d3ac292abb95767ec197a4ddd094523ce Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:42 -0700 Subject: perf evsel: Add is_pmu_core inorder to interpret own_cpus The behaviour of handling cpu maps varies for core and other PMUs. For core PMUs the cpu map lists all valid CPUs, whereas for other PMUs the map is the default CPU. Add a flag in the evsel to indicate if a PMU is core to help with later interpreting of the cpu maps and populate it when the evsel is created during parsing. When propagating cpu maps, core PMUs should intersect the cpu map of the PMU with the user requested one. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/internal/evsel.h | 9 +++++++++ tools/perf/util/evsel.c | 1 + tools/perf/util/parse-events.c | 1 + 3 files changed, 11 insertions(+) diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index a99a75d9e78f..4d6f2a032f45 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -41,7 +41,14 @@ struct perf_sample_id { struct perf_evsel { struct list_head node; struct perf_event_attr attr; + /** The commonly used cpu map of CPUs the event should be opened upon, etc. */ struct perf_cpu_map *cpus; + /** + * The cpu map read from the PMU. For core PMUs this is the list of all + * CPUs the event can be opened upon. For other PMUs this is the default + * cpu map for opening the event on, for example, the first CPU on a + * socket for an uncore event. + */ struct perf_cpu_map *own_cpus; struct perf_thread_map *threads; struct xyarray *fd; @@ -65,6 +72,8 @@ struct perf_evsel { * i.e. it cannot be the 'any CPU' value of -1. */ bool requires_cpu; + /** Is the PMU for the event a core one? Effects the handling of own_cpus. */ + bool is_pmu_core; int idx; }; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2f5910b31fa9..8c8f371ea2b5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -415,6 +415,7 @@ struct evsel *evsel__clone(struct evsel *orig) evsel->core.nr_members = orig->core.nr_members; evsel->core.system_wide = orig->core.system_wide; evsel->core.requires_cpu = orig->core.requires_cpu; + evsel->core.is_pmu_core = orig->core.is_pmu_core; if (orig->name) { evsel->name = strdup(orig->name); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b93264f8a37c..1a0be395c887 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -260,6 +260,7 @@ __add_event(struct list_head *list, int *idx, evsel->core.cpus = cpus; evsel->core.own_cpus = perf_cpu_map__get(cpus); evsel->core.requires_cpu = pmu ? pmu->is_uncore : false; + evsel->core.is_pmu_core = pmu ? pmu->is_core : false; evsel->auto_merge_stats = auto_merge_stats; evsel->pmu = pmu; evsel->pmu_name = pmu && pmu->name ? strdup(pmu->name) : NULL; -- cgit v1.2.3 From a0c41caebab2fa224454d50dd4e29ae008ead25f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:43 -0700 Subject: perf pmu: Add CPU map for "cpu" PMUs A typical "cpu" PMU has no "cpus" or "cpumask" file meaning the CPU map is set to NULL, which also encodes an empty CPU map. Update pmu_cpumask so that if the "cpu" PMU fails to load a CPU map, use a default of all online PMUs. Remove const from cpu_map__online for the sake of reference counting. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 4 ++-- tools/perf/util/cpumap.h | 4 ++-- tools/perf/util/pmu.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index a0719816a218..0e090e8bc334 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -667,9 +667,9 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size) return ptr - buf; } -const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ +struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ { - static const struct perf_cpu_map *online = NULL; + static struct perf_cpu_map *online; if (!online) online = perf_cpu_map__new(NULL); /* from /sys/devices/system/cpu/online */ diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index f394ccc0ccfb..9df2aeb34d3d 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -55,7 +55,7 @@ struct perf_cpu_map *cpu_map__new_data(const struct perf_record_cpu_map_data *da size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp); -const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */ +struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */ int cpu__setup_cpunode_map(void); @@ -66,7 +66,7 @@ struct perf_cpu cpu__max_present_cpu(void); /** * cpu_map__is_dummy - Events associated with a pid, rather than a CPU, use a single dummy map with an entry of -1. */ -static inline bool cpu_map__is_dummy(struct perf_cpu_map *cpus) +static inline bool cpu_map__is_dummy(const struct perf_cpu_map *cpus) { return perf_cpu_map__nr(cpus) == 1 && perf_cpu_map__cpu(cpus, 0).cpu == -1; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index e8c0762c311a..d992f5242d99 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -610,7 +610,7 @@ static struct perf_cpu_map *pmu_cpumask(int dirfd, const char *name) return cpus; } - return NULL; + return !strcmp(name, "cpu") ? perf_cpu_map__get(cpu_map__online()) : NULL; } static bool pmu_is_uncore(int dirfd, const char *name) -- cgit v1.2.3 From ef91871c960ed1e9e790ed66840835fac87614b7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:44 -0700 Subject: perf evlist: Propagate user CPU maps intersecting core PMU maps The CPU map for a non-core PMU gives a default CPU value for perf_event_open. For core PMUs the CPU map lists all CPUs the evsel may be opened on. If there are >1 core PMU, the CPU maps will list the CPUs for that core PMU, but the user_requested_cpus may contain CPUs that are invalid for the PMU and cause perf_event_open to fail. To avoid this, when propagating the CPU map for core PMUs intersect it with the CPU map of the PMU (the evsel's "own_cpus"). Add comments to __perf_evlist__propagate_maps to explain its somewhat complex behavior. Fix the related comments for system_wide in struct perf_evsel. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/evlist.c | 25 ++++++++++++++++++++----- tools/lib/perf/include/internal/evsel.h | 6 +++--- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 81e8b5fcd8ba..b8b066d0dc5e 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -36,18 +36,33 @@ void perf_evlist__init(struct perf_evlist *evlist) static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, struct perf_evsel *evsel) { - /* - * We already have cpus for evsel (via PMU sysfs) so - * keep it, if there's no target cpu list defined. - */ if (evsel->system_wide) { + /* System wide: set the cpu map of the evsel to all online CPUs. */ perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__new(NULL); + } else if (evlist->has_user_cpus && evsel->is_pmu_core) { + /* + * User requested CPUs on a core PMU, ensure the requested CPUs + * are valid by intersecting with those of the PMU. + */ + perf_cpu_map__put(evsel->cpus); + evsel->cpus = perf_cpu_map__intersect(evlist->user_requested_cpus, evsel->own_cpus); } else if (!evsel->own_cpus || evlist->has_user_cpus || - (!evsel->requires_cpu && perf_cpu_map__empty(evlist->user_requested_cpus))) { + (!evsel->requires_cpu && perf_cpu_map__has_any_cpu(evlist->user_requested_cpus))) { + /* + * The PMU didn't specify a default cpu map, this isn't a core + * event and the user requested CPUs or the evlist user + * requested CPUs have the "any CPU" (aka dummy) CPU value. In + * which case use the user requested CPUs rather than the PMU + * ones. + */ perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus); } else if (evsel->cpus != evsel->own_cpus) { + /* + * No user requested cpu map but the PMU cpu map doesn't match + * the evsel's. Reset it back to the PMU cpu map. + */ perf_cpu_map__put(evsel->cpus); evsel->cpus = perf_cpu_map__get(evsel->own_cpus); } diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index 4d6f2a032f45..5cd220a61962 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -62,9 +62,9 @@ struct perf_evsel { int nr_members; /* * system_wide is for events that need to be on every CPU, irrespective - * of user requested CPUs or threads. Map propagation will set cpus to - * this event's own_cpus, whereby they will contribute to evlist - * all_cpus. + * of user requested CPUs or threads. Tha main example of this is the + * dummy event. Map propagation will set cpus for this event to all CPUs + * as software PMU events like dummy, have a CPU map that is empty. */ bool system_wide; /* -- cgit v1.2.3 From 42249160cc6837396acf3358bc724612ce24d035 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:45 -0700 Subject: perf evlist: Allow has_user_cpus to be set on hybrid Now that CPU map propagation only sets valid CPUs for core PMUs, there is no reason to disable "has_user_cpus" for hybrid. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a0504316b06f..2e2c3509bec3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1067,7 +1067,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target) if (!cpus) goto out_delete_threads; - evlist->core.has_user_cpus = !!target->cpu_list && !target->hybrid; + evlist->core.has_user_cpus = !!target->cpu_list; perf_evlist__set_maps(&evlist->core, cpus, threads); -- cgit v1.2.3 From 8ec984d53714dfa538f3f5b1e22a309ac18edf63 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:46 -0700 Subject: perf target: Remove unused hybrid value Previously this was used to modify CPU map propagation, but it is now unnecessary as map propagation ensure core PMUs only have valid PMUs in the CPU map from user requested CPUs. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 -- tools/perf/builtin-stat.c | 1 - tools/perf/util/target.h | 1 - 3 files changed, 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index ec0f2d5f189f..d152ab04a209 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -4204,8 +4204,6 @@ int cmd_record(int argc, const char **argv) goto out; } - rec->opts.target.hybrid = perf_pmu__has_hybrid(); - if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP) arch__add_leaf_frame_record_opts(&rec->opts); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 176deeb8ee66..8d4c4f4ca8ea 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2730,7 +2730,6 @@ int cmd_stat(int argc, const char **argv) goto out; } - target.hybrid = perf_pmu__has_hybrid(); if (evlist__create_maps(evsel_list, &target) < 0) { if (target__has_task(&target)) { pr_err("Problems finding threads of monitor\n"); diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h index 880f1af7f6ad..d582cae8e105 100644 --- a/tools/perf/util/target.h +++ b/tools/perf/util/target.h @@ -17,7 +17,6 @@ struct target { bool default_per_cpu; bool per_thread; bool use_bpf; - bool hybrid; int initial_delay; const char *attr_map; }; -- cgit v1.2.3 From 5ac72634482143a8be5e04e5d09a2026f6a94315 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:47 -0700 Subject: perf tools: Warn if no user requested CPUs match PMU's CPUs In commit 1d3351e631fc ("perf tools: Enable on a list of CPUs for hybrid") perf on hybrid will warn if a user requested CPU doesn't match the PMU of the given event but only for hybrid PMUs. Make the logic generic for all PMUs and remove the hybrid logic. Warn if a CPU is requested that isn't present/offline for events not on the core. Warn if a CPU is requested for a core PMU, but the CPU isn't within the cpu map of that PMU. For example on a 16 (0-15) CPU system: ``` $ perf stat -e imc_free_running/data_read/,cycles -C 16 true WARNING: A requested CPU in '16' is not supported by PMU 'uncore_imc_free_running_1' (CPUs 0-15) for event 'imc_free_running/data_read/' WARNING: A requested CPU in '16' is not supported by PMU 'uncore_imc_free_running_0' (CPUs 0-15) for event 'imc_free_running/data_read/' WARNING: A requested CPU in '16' is not supported by PMU 'cpu' (CPUs 0-15) for event 'cycles' Performance counter stats for 'CPU(s) 16': MiB imc_free_running/data_read/ cycles 0.000575312 seconds time elapsed ``` Remove evlist__fix_hybrid_cpus that previously produced the warnings and also perf_pmu__cpus_match that worked with evlist__fix_hybrid_cpus to change CPU maps for hybrid CPUs, something that is no longer necessary as CPU map propagation properly intersects user requested CPUs with the core PMU's CPU map. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 +--- tools/perf/builtin-stat.c | 5 +-- tools/perf/util/evlist-hybrid.c | 74 ----------------------------------------- tools/perf/util/evlist-hybrid.h | 1 - tools/perf/util/evlist.c | 39 ++++++++++++++++++++++ tools/perf/util/evlist.h | 2 ++ tools/perf/util/pmu.c | 33 ------------------ tools/perf/util/pmu.h | 4 --- 8 files changed, 43 insertions(+), 121 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d152ab04a209..88f7b4241153 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -4198,11 +4198,7 @@ int cmd_record(int argc, const char **argv) /* Enable ignoring missing threads when -u/-p option is defined. */ rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid; - if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) { - pr_err("failed to use cpu list %s\n", - rec->opts.target.cpu_list); - goto out; - } + evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list); if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP) arch__add_leaf_frame_record_opts(&rec->opts); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8d4c4f4ca8ea..84d304cffd2c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2725,10 +2725,7 @@ int cmd_stat(int argc, const char **argv) } } - if (evlist__fix_hybrid_cpus(evsel_list, target.cpu_list)) { - pr_err("failed to use cpu list %s\n", target.cpu_list); - goto out; - } + evlist__warn_user_requested_cpus(evsel_list, target.cpu_list); if (evlist__create_maps(evsel_list, &target) < 0) { if (target__has_task(&target)) { diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index 57f02beef023..db3f5fbdebe1 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -86,77 +86,3 @@ bool evlist__has_hybrid(struct evlist *evlist) return false; } - -int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list) -{ - struct perf_cpu_map *cpus; - struct evsel *evsel, *tmp; - struct perf_pmu *pmu; - int ret, unmatched_count = 0, events_nr = 0; - - if (!perf_pmu__has_hybrid() || !cpu_list) - return 0; - - cpus = perf_cpu_map__new(cpu_list); - if (!cpus) - return -1; - - /* - * The evsels are created with hybrid pmu's cpus. But now we - * need to check and adjust the cpus of evsel by cpu_list because - * cpu_list may cause conflicts with cpus of evsel. For example, - * cpus of evsel is cpu0-7, but the cpu_list is cpu6-8, we need - * to adjust the cpus of evsel to cpu6-7. And then propatate maps - * in evlist__create_maps(). - */ - evlist__for_each_entry_safe(evlist, tmp, evsel) { - struct perf_cpu_map *matched_cpus, *unmatched_cpus; - char buf1[128], buf2[128]; - - pmu = perf_pmu__find_hybrid_pmu(evsel->pmu_name); - if (!pmu) - continue; - - ret = perf_pmu__cpus_match(pmu, cpus, &matched_cpus, - &unmatched_cpus); - if (ret) - goto out; - - events_nr++; - - if (perf_cpu_map__nr(matched_cpus) > 0 && - (perf_cpu_map__nr(unmatched_cpus) > 0 || - perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(cpus) || - perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(pmu->cpus))) { - perf_cpu_map__put(evsel->core.cpus); - perf_cpu_map__put(evsel->core.own_cpus); - evsel->core.cpus = perf_cpu_map__get(matched_cpus); - evsel->core.own_cpus = perf_cpu_map__get(matched_cpus); - - if (perf_cpu_map__nr(unmatched_cpus) > 0) { - cpu_map__snprint(matched_cpus, buf1, sizeof(buf1)); - pr_warning("WARNING: use %s in '%s' for '%s', skip other cpus in list.\n", - buf1, pmu->name, evsel->name); - } - } - - if (perf_cpu_map__nr(matched_cpus) == 0) { - evlist__remove(evlist, evsel); - evsel__delete(evsel); - - cpu_map__snprint(cpus, buf1, sizeof(buf1)); - cpu_map__snprint(pmu->cpus, buf2, sizeof(buf2)); - pr_warning("WARNING: %s isn't a '%s', please use a CPU list in the '%s' range (%s)\n", - buf1, pmu->name, pmu->name, buf2); - unmatched_count++; - } - - perf_cpu_map__put(matched_cpus); - perf_cpu_map__put(unmatched_cpus); - } - if (events_nr) - ret = (unmatched_count == events_nr) ? -1 : 0; -out: - perf_cpu_map__put(cpus); - return ret; -} diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h index aacdb1b0f948..19f74b4c340a 100644 --- a/tools/perf/util/evlist-hybrid.h +++ b/tools/perf/util/evlist-hybrid.h @@ -10,6 +10,5 @@ int evlist__add_default_hybrid(struct evlist *evlist, bool precise); void evlist__warn_hybrid_group(struct evlist *evlist); bool evlist__has_hybrid(struct evlist *evlist); -int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list); #endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 2e2c3509bec3..9dfa977193b3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2465,3 +2465,42 @@ void evlist__check_mem_load_aux(struct evlist *evlist) } } } + +/** + * evlist__warn_user_requested_cpus() - Check each evsel against requested CPUs + * and warn if the user CPU list is inapplicable for the event's PMU's + * CPUs. Not core PMUs list a CPU in sysfs, but this may be overwritten by a + * user requested CPU and so any online CPU is applicable. Core PMUs handle + * events on the CPUs in their list and otherwise the event isn't supported. + * @evlist: The list of events being checked. + * @cpu_list: The user provided list of CPUs. + */ +void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_list) +{ + struct perf_cpu_map *user_requested_cpus; + struct evsel *pos; + + if (!cpu_list) + return; + + user_requested_cpus = perf_cpu_map__new(cpu_list); + if (!user_requested_cpus) + return; + + evlist__for_each_entry(evlist, pos) { + struct perf_cpu_map *intersect, *to_test; + const struct perf_pmu *pmu = evsel__find_pmu(pos); + + to_test = pmu && pmu->is_core ? pmu->cpus : cpu_map__online(); + intersect = perf_cpu_map__intersect(to_test, user_requested_cpus); + if (!perf_cpu_map__equal(intersect, user_requested_cpus)) { + char buf[128]; + + cpu_map__snprint(to_test, buf, sizeof(buf)); + pr_warning("WARNING: A requested CPU in '%s' is not supported by PMU '%s' (CPUs %s) for event '%s'\n", + cpu_list, pmu ? pmu->name : "cpu", buf, evsel__name(pos)); + } + perf_cpu_map__put(intersect); + } + perf_cpu_map__put(user_requested_cpus); +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index e7e5540cc970..5e7ff44f3043 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -447,4 +447,6 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx); int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); void evlist__check_mem_load_aux(struct evlist *evlist); +void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_list); + #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d992f5242d99..cd94abe7a87a 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -2046,39 +2046,6 @@ int perf_pmu__match(char *pattern, char *name, char *tok) return 0; } -int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, - struct perf_cpu_map **mcpus_ptr, - struct perf_cpu_map **ucpus_ptr) -{ - struct perf_cpu_map *pmu_cpus = pmu->cpus; - struct perf_cpu_map *matched_cpus, *unmatched_cpus; - struct perf_cpu cpu; - int i, matched_nr = 0, unmatched_nr = 0; - - matched_cpus = perf_cpu_map__default_new(); - if (!matched_cpus) - return -1; - - unmatched_cpus = perf_cpu_map__default_new(); - if (!unmatched_cpus) { - perf_cpu_map__put(matched_cpus); - return -1; - } - - perf_cpu_map__for_each_cpu(cpu, i, cpus) { - if (!perf_cpu_map__has(pmu_cpus, cpu)) - RC_CHK_ACCESS(unmatched_cpus)->map[unmatched_nr++] = cpu; - else - RC_CHK_ACCESS(matched_cpus)->map[matched_nr++] = cpu; - } - - perf_cpu_map__set_nr(unmatched_cpus, unmatched_nr); - perf_cpu_map__set_nr(matched_cpus, matched_nr); - *mcpus_ptr = matched_cpus; - *ucpus_ptr = unmatched_cpus; - return 0; -} - double __weak perf_pmu__cpu_slots_per_cycle(void) { return NAN; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 96236a79c6fd..af10d137e2b5 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -265,10 +265,6 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); bool perf_pmu__has_hybrid(void); int perf_pmu__match(char *pattern, char *name, char *tok); -int perf_pmu__cpus_match(struct perf_pmu *pmu, struct perf_cpu_map *cpus, - struct perf_cpu_map **mcpus_ptr, - struct perf_cpu_map **ucpus_ptr); - char *pmu_find_real_name(const char *name); char *pmu_find_alias_name(const char *name); double perf_pmu__cpu_slots_per_cycle(void); -- cgit v1.2.3 From b4388dfa3ae5aca7d4d3bbc9b80fe5e483ef78e9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:48 -0700 Subject: perf evlist: Remove evlist__warn_hybrid_group Parse events now corrects PMU groups in parse_events__sort_events_and_fix_groups and so this warning is no longer possible. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 3 --- tools/perf/util/evlist-hybrid.c | 32 -------------------------------- tools/perf/util/evlist-hybrid.h | 1 - 3 files changed, 36 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 84d304cffd2c..d414ee30dcf9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -189,9 +189,6 @@ static void evlist__check_cpu_maps(struct evlist *evlist) { struct evsel *evsel, *warned_leader = NULL; - if (evlist__has_hybrid(evlist)) - evlist__warn_hybrid_group(evlist); - evlist__for_each_entry(evlist, evsel) { struct evsel *leader = evsel__leader(evsel); diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index db3f5fbdebe1..0f59c80f27b2 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -41,38 +41,6 @@ int evlist__add_default_hybrid(struct evlist *evlist, bool precise) return 0; } -static bool group_hybrid_conflict(struct evsel *leader) -{ - struct evsel *pos, *prev = NULL; - - for_each_group_evsel(pos, leader) { - if (!evsel__is_hybrid(pos)) - continue; - - if (prev && strcmp(prev->pmu_name, pos->pmu_name)) - return true; - - prev = pos; - } - - return false; -} - -void evlist__warn_hybrid_group(struct evlist *evlist) -{ - struct evsel *evsel; - - evlist__for_each_entry(evlist, evsel) { - if (evsel__is_group_leader(evsel) && - evsel->core.nr_members > 1 && - group_hybrid_conflict(evsel)) { - pr_warning("WARNING: events in group from " - "different hybrid PMUs!\n"); - return; - } - } -} - bool evlist__has_hybrid(struct evlist *evlist) { struct evsel *evsel; diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h index 19f74b4c340a..4b000eda6626 100644 --- a/tools/perf/util/evlist-hybrid.h +++ b/tools/perf/util/evlist-hybrid.h @@ -8,7 +8,6 @@ #include int evlist__add_default_hybrid(struct evlist *evlist, bool precise); -void evlist__warn_hybrid_group(struct evlist *evlist); bool evlist__has_hybrid(struct evlist *evlist); #endif /* __PERF_EVLIST_HYBRID_H */ -- cgit v1.2.3 From 7b100989b4f6bce7090ef89badf4091b1730d14c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:49 -0700 Subject: perf evlist: Remove __evlist__add_default __evlist__add_default adds a cycles event to a typically empty evlist and was extended for hybrid with evlist__add_default_hybrid, as more than 1 PMU was necessary. Rather than have dedicated logic for the cycles event, this change switches to parsing 'cycles:P' which will handle wildcarding the PMUs appropriately for hybrid. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/evsel.c | 20 ----------------- tools/perf/builtin-record.c | 13 +++--------- tools/perf/builtin-top.c | 10 +++++---- tools/perf/util/evlist-hybrid.c | 25 ---------------------- tools/perf/util/evlist-hybrid.h | 1 - tools/perf/util/evlist.c | 22 +++++++------------ tools/perf/util/evlist.h | 7 ------ tools/perf/util/evsel.c | 46 ---------------------------------------- tools/perf/util/evsel.h | 3 --- tools/perf/util/python.c | 8 +++++++ 10 files changed, 25 insertions(+), 130 deletions(-) diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index ea3972d785d1..153cdca94cd4 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -16,26 +16,6 @@ void arch_evsel__set_sample_weight(struct evsel *evsel) evsel__set_sample_bit(evsel, WEIGHT_STRUCT); } -void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr) -{ - struct perf_env env = { .total_mem = 0, } ; - - if (!perf_env__cpuid(&env)) - return; - - /* - * On AMD, precise cycles event sampling internally uses IBS pmu. - * But IBS does not have filtering capabilities and perf by default - * sets exclude_guest = 1. This makes IBS pmu event init fail and - * thus perf ends up doing non-precise sampling. Avoid it by clearing - * exclude_guest. - */ - if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD")) - attr->exclude_guest = 0; - - free(env.cpuid); -} - /* Check whether the evsel's PMU supports the perf metrics */ bool evsel__sys_has_perf_metrics(const struct evsel *evsel) { diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 88f7b4241153..d80b54a6f450 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -4161,18 +4161,11 @@ int cmd_record(int argc, const char **argv) record.opts.tail_synthesize = true; if (rec->evlist->core.nr_entries == 0) { - if (perf_pmu__has_hybrid()) { - err = evlist__add_default_hybrid(rec->evlist, - !record.opts.no_samples); - } else { - err = __evlist__add_default(rec->evlist, - !record.opts.no_samples); - } + bool can_profile_kernel = perf_event_paranoid_check(1); - if (err < 0) { - pr_err("Not enough memory for event selector list\n"); + err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu"); + if (err) goto out; - } } if (rec->opts.target.tid && !rec->opts.no_inherit_set) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 48ee49e95c5e..27a7f068207d 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1653,10 +1653,12 @@ int cmd_top(int argc, const char **argv) if (annotate_check_args(&top.annotation_opts) < 0) goto out_delete_evlist; - if (!top.evlist->core.nr_entries && - evlist__add_default(top.evlist) < 0) { - pr_err("Not enough memory for event selector list\n"); - goto out_delete_evlist; + if (!top.evlist->core.nr_entries) { + bool can_profile_kernel = perf_event_paranoid_check(1); + int err = parse_event(top.evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu"); + + if (err) + goto out_delete_evlist; } status = evswitch__init(&top.evswitch, top.evlist, stderr); diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index 0f59c80f27b2..64f78d06fe19 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -16,31 +16,6 @@ #include #include -int evlist__add_default_hybrid(struct evlist *evlist, bool precise) -{ - struct evsel *evsel; - struct perf_pmu *pmu; - __u64 config; - struct perf_cpu_map *cpus; - - perf_pmu__for_each_hybrid_pmu(pmu) { - config = PERF_COUNT_HW_CPU_CYCLES | - ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT); - evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, - config); - if (!evsel) - return -ENOMEM; - - cpus = perf_cpu_map__get(pmu->cpus); - evsel->core.cpus = cpus; - evsel->core.own_cpus = perf_cpu_map__get(cpus); - evsel->pmu_name = strdup(pmu->name); - evlist__add(evlist, evsel); - } - - return 0; -} - bool evlist__has_hybrid(struct evlist *evlist) { struct evsel *evsel; diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h index 4b000eda6626..0cded76eb344 100644 --- a/tools/perf/util/evlist-hybrid.h +++ b/tools/perf/util/evlist-hybrid.h @@ -7,7 +7,6 @@ #include "evlist.h" #include -int evlist__add_default_hybrid(struct evlist *evlist, bool precise); bool evlist__has_hybrid(struct evlist *evlist); #endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 9dfa977193b3..63f8821a5395 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -93,8 +93,15 @@ struct evlist *evlist__new(void) struct evlist *evlist__new_default(void) { struct evlist *evlist = evlist__new(); + bool can_profile_kernel; + int err; + + if (!evlist) + return NULL; - if (evlist && evlist__add_default(evlist)) { + can_profile_kernel = perf_event_paranoid_check(1); + err = parse_event(evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu"); + if (err) { evlist__delete(evlist); evlist = NULL; } @@ -237,19 +244,6 @@ static void evlist__set_leader(struct evlist *evlist) perf_evlist__set_leader(&evlist->core); } -int __evlist__add_default(struct evlist *evlist, bool precise) -{ - struct evsel *evsel; - - evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, - PERF_COUNT_HW_CPU_CYCLES); - if (evsel == NULL) - return -ENOMEM; - - evlist__add(evlist, evsel); - return 0; -} - static struct evsel *evlist__dummy_event(struct evlist *evlist) { struct perf_event_attr attr = { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 5e7ff44f3043..664c6bf7b3e0 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -100,13 +100,6 @@ void evlist__delete(struct evlist *evlist); void evlist__add(struct evlist *evlist, struct evsel *entry); void evlist__remove(struct evlist *evlist, struct evsel *evsel); -int __evlist__add_default(struct evlist *evlist, bool precise); - -static inline int evlist__add_default(struct evlist *evlist) -{ - return __evlist__add_default(evlist, true); -} - int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs); int __evlist__add_default_attrs(struct evlist *evlist, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8c8f371ea2b5..1df8f967d2eb 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -316,48 +316,6 @@ struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx) return evsel; } -static bool perf_event_can_profile_kernel(void) -{ - return perf_event_paranoid_check(1); -} - -struct evsel *evsel__new_cycles(bool precise __maybe_unused, __u32 type, __u64 config) -{ - struct perf_event_attr attr = { - .type = type, - .config = config, - .exclude_kernel = !perf_event_can_profile_kernel(), - }; - struct evsel *evsel; - - event_attr_init(&attr); - - /* - * Now let the usual logic to set up the perf_event_attr defaults - * to kick in when we return and before perf_evsel__open() is called. - */ - evsel = evsel__new(&attr); - if (evsel == NULL) - goto out; - - arch_evsel__fixup_new_cycles(&evsel->core.attr); - - evsel->precise_max = true; - - /* use asprintf() because free(evsel) assumes name is allocated */ - if (asprintf(&evsel->name, "cycles%s%s%.*s", - (attr.precise_ip || attr.exclude_kernel) ? ":" : "", - attr.exclude_kernel ? "u" : "", - attr.precise_ip ? attr.precise_ip + 1 : 0, "ppp") < 0) - goto error_free; -out: - return evsel; -error_free: - evsel__delete(evsel); - evsel = NULL; - goto out; -} - int copy_config_terms(struct list_head *dst, struct list_head *src) { struct evsel_config_term *pos, *tmp; @@ -1131,10 +1089,6 @@ void __weak arch_evsel__set_sample_weight(struct evsel *evsel) evsel__set_sample_bit(evsel, WEIGHT); } -void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_unused) -{ -} - void __weak arch__post_evsel_config(struct evsel *evsel __maybe_unused, struct perf_event_attr *attr __maybe_unused) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index df8928745fc6..429b172cc94d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -243,8 +243,6 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name) } #endif -struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config); - #ifdef HAVE_LIBTRACEEVENT struct tep_event *event_format__new(const char *sys, const char *name); #endif @@ -312,7 +310,6 @@ void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_forma void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier); void arch_evsel__set_sample_weight(struct evsel *evsel); -void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr); void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr); int evsel__set_filter(struct evsel *evsel, const char *filter); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 0faea4c75eed..3c1f4c979c9e 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -49,6 +49,14 @@ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) #endif +/* + * Avoid bringing in event parsing. + */ +int parse_event(struct evlist *evlist __maybe_unused, const char *str __maybe_unused) +{ + return 0; +} + /* * Provide these two so that we don't have to link against callchain.c and * start dragging hist.c, etc. -- cgit v1.2.3 From b167b530eb83dfd791061e1d312236bffde772a4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:50 -0700 Subject: perf evlist: Reduce scope of evlist__has_hybrid Function is only used in printout, reduce scope to stat-display.c. Remove the now empty evlist-hybrid.c and evlist-hybrid.h. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 1 - tools/perf/builtin-stat.c | 1 - tools/perf/util/Build | 1 - tools/perf/util/evlist-hybrid.c | 31 ------------------------------- tools/perf/util/evlist-hybrid.h | 12 ------------ tools/perf/util/evlist.c | 1 - tools/perf/util/stat-display.c | 15 ++++++++++++++- 7 files changed, 14 insertions(+), 48 deletions(-) delete mode 100644 tools/perf/util/evlist-hybrid.c delete mode 100644 tools/perf/util/evlist-hybrid.h diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d80b54a6f450..e30e8d6a6575 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -50,7 +50,6 @@ #include "util/pfm.h" #include "util/clockid.h" #include "util/pmu-hybrid.h" -#include "util/evlist-hybrid.h" #include "util/off_cpu.h" #include "util/bpf-filter.h" #include "asm/bug.h" diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d414ee30dcf9..62bbeea93bf3 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,7 +48,6 @@ #include "util/pmu.h" #include "util/event.h" #include "util/evlist.h" -#include "util/evlist-hybrid.h" #include "util/evsel.h" #include "util/debug.h" #include "util/color.h" diff --git a/tools/perf/util/Build b/tools/perf/util/Build index c146736ead19..21e4cdcba504 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -11,7 +11,6 @@ perf-y += db-export.o perf-y += env.o perf-y += event.o perf-y += evlist.o -perf-y += evlist-hybrid.o perf-y += sideband_evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c deleted file mode 100644 index 64f78d06fe19..000000000000 --- a/tools/perf/util/evlist-hybrid.c +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include "cpumap.h" -#include "evlist.h" -#include "evsel.h" -#include "../perf.h" -#include "util/pmu-hybrid.h" -#include "util/evlist-hybrid.h" -#include "debug.h" -#include -#include -#include -#include -#include -#include -#include - -bool evlist__has_hybrid(struct evlist *evlist) -{ - struct evsel *evsel; - - evlist__for_each_entry(evlist, evsel) { - if (evsel->pmu_name && - perf_pmu__is_hybrid(evsel->pmu_name)) { - return true; - } - } - - return false; -} diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h deleted file mode 100644 index 0cded76eb344..000000000000 --- a/tools/perf/util/evlist-hybrid.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PERF_EVLIST_HYBRID_H -#define __PERF_EVLIST_HYBRID_H - -#include -#include -#include "evlist.h" -#include - -bool evlist__has_hybrid(struct evlist *evlist); - -#endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 63f8821a5395..82c0b3d0c822 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -28,7 +28,6 @@ #include "util/string2.h" #include "util/perf_api_probe.h" #include "util/evsel_fprintf.h" -#include "util/evlist-hybrid.h" #include "util/pmu.h" #include "util/sample.h" #include "util/bpf-filter.h" diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 319f456f0673..4cce7d3c5e52 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -20,7 +20,6 @@ #include "util.h" #include "iostat.h" #include "pmu-hybrid.h" -#include "evlist-hybrid.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -692,6 +691,20 @@ static bool is_mixed_hw_group(struct evsel *counter) return false; } +static bool evlist__has_hybrid(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->pmu_name && + perf_pmu__is_hybrid(evsel->pmu_name)) { + return true; + } + } + + return false; +} + static void printout(struct perf_stat_config *config, struct outstate *os, double uval, u64 run, u64 ena, double noise, int aggr_idx) { -- cgit v1.2.3 From 4ced2c246e2c9c90a7ea96f4bcd31a0b696b8dd6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:51 -0700 Subject: perf pmu: Remove perf_pmu__hybrid_mounted perf_pmu__hybrid_mounted is used to detect whether cpu_core or cpu_atom is mounted with a non-empty cpus file by pmu_lookup. Discussion [1] showed the empty cpus file check to be redundant and so pmu_lookup needn't have a call to perf_pmu__hybrid_mounted. Checking hybrid_mounted in pmu_is_uncore is redundant as the next cpumask read will fail returning false. Reduce the scope of perf_pmu__find_hybrid_pmu by making it static. [1] https://lore.kernel.org/lkml/20230524221831.1741381-17-irogers@google.com/ Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu-hybrid.c | 15 +-------------- tools/perf/util/pmu-hybrid.h | 3 --- tools/perf/util/pmu.c | 13 +------------ 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c index bc4cb0738c35..7fe943dd3217 100644 --- a/tools/perf/util/pmu-hybrid.c +++ b/tools/perf/util/pmu-hybrid.c @@ -18,20 +18,7 @@ LIST_HEAD(perf_pmu__hybrid_pmus); -bool perf_pmu__hybrid_mounted(const char *name) -{ - int cpu; - char pmu_name[PATH_MAX]; - struct perf_pmu pmu = {.name = pmu_name}; - - if (strncmp(name, "cpu_", 4)) - return false; - - strlcpy(pmu_name, name, sizeof(pmu_name)); - return perf_pmu__scan_file(&pmu, "cpus", "%u", &cpu) > 0; -} - -struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name) +static struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name) { struct perf_pmu *pmu; diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index 206b94931531..8dbcae935020 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -13,9 +13,6 @@ extern struct list_head perf_pmu__hybrid_pmus; #define perf_pmu__for_each_hybrid_pmu(pmu) \ list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list) -bool perf_pmu__hybrid_mounted(const char *name); - -struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); bool perf_pmu__is_hybrid(const char *name); static inline int perf_pmu__hybrid_pmu_num(void) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index cd94abe7a87a..83c7eeb8abea 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -617,9 +617,6 @@ static bool pmu_is_uncore(int dirfd, const char *name) { int fd; - if (perf_pmu__hybrid_mounted(name)) - return false; - fd = perf_pmu__pathname_fd(dirfd, name, "cpumask", O_PATH); if (fd < 0) return false; @@ -907,15 +904,8 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) LIST_HEAD(aliases); __u32 type; char *name = pmu_find_real_name(lookup_name); - bool is_hybrid = perf_pmu__hybrid_mounted(name); char *alias_name; - /* - * Check pmu name for hybrid and the pmu may be invalid in sysfs - */ - if (!strncmp(name, "cpu_", 4) && !is_hybrid) - return NULL; - /* * The pmu data we store & need consists of the pmu * type value and format definitions. Load both right @@ -936,7 +926,6 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) pmu->cpus = pmu_cpumask(dirfd, name); pmu->name = strdup(name); - if (!pmu->name) goto err; @@ -967,7 +956,7 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); - if (is_hybrid) + if (!strcmp(name, "cpu_core") || !strcmp(name, "cpu_atom")) list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus); else INIT_LIST_HEAD(&pmu->hybrid_list); -- cgit v1.2.3 From ab1a1c77a38ad1efea4396f271ccde53b58c1b8e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:52 -0700 Subject: perf pmu: Rewrite perf_pmu__has_hybrid to avoid list Rather than list empty on perf_pmu__hybrid_pmus, detect if any core PMUs match the hybrid name. Computed values held in statics to avoid recomputation. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 83c7eeb8abea..5a7bfbf621d0 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -60,8 +60,6 @@ struct perf_pmu_format { struct list_head list; }; -static bool hybrid_scanned; - static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name); /* @@ -2013,12 +2011,20 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, bool perf_pmu__has_hybrid(void) { + static bool hybrid_scanned, has_hybrid; + if (!hybrid_scanned) { + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->is_core && is_pmu_hybrid(pmu->name)) { + has_hybrid = true; + break; + } + } hybrid_scanned = true; - perf_pmu__scan(NULL); } - - return !list_empty(&perf_pmu__hybrid_pmus); + return has_hybrid; } int perf_pmu__match(char *pattern, char *name, char *tok) -- cgit v1.2.3 From dd64647ecbba7572e41489c9bc54980aeb434bc2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:53 -0700 Subject: perf x86: Iterate hybrid PMUs as core PMUs Rather than iterating over a separate hybrid list, iterate all PMUs with the hybrid ones having is_core as true. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 2 +- tools/perf/arch/x86/util/evlist.c | 25 +++++++++++++++++-------- tools/perf/arch/x86/util/perf_regs.c | 14 ++++++++++---- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index 941a9edfed4e..944bd1b4bab6 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -3,7 +3,7 @@ #include "debug.h" #include "evlist.h" #include "evsel.h" -#include "pmu-hybrid.h" +#include "pmu.h" #include "tests/tests.h" static bool test_config(const struct evsel *evsel, __u64 expected_config) diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index 1b6065841fb0..03f7eb4cf0a4 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -4,7 +4,6 @@ #include "util/evlist.h" #include "util/parse-events.h" #include "util/event.h" -#include "util/pmu-hybrid.h" #include "topdown.h" #include "evsel.h" @@ -12,9 +11,6 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs) { - struct perf_cpu_map *cpus; - struct evsel *evsel, *n; - struct perf_pmu *pmu; LIST_HEAD(head); size_t i = 0; @@ -25,15 +21,24 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, return evlist__add_attrs(evlist, attrs, nr_attrs); for (i = 0; i < nr_attrs; i++) { + struct perf_pmu *pmu = NULL; + if (attrs[i].type == PERF_TYPE_SOFTWARE) { - evsel = evsel__new(attrs + i); + struct evsel *evsel = evsel__new(attrs + i); + if (evsel == NULL) goto out_delete_partial_list; list_add_tail(&evsel->core.node, &head); continue; } - perf_pmu__for_each_hybrid_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + struct perf_cpu_map *cpus; + struct evsel *evsel; + + if (!pmu->is_core) + continue; + evsel = evsel__new(attrs + i); if (evsel == NULL) goto out_delete_partial_list; @@ -51,8 +56,12 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, return 0; out_delete_partial_list: - __evlist__for_each_entry_safe(&head, n, evsel) - evsel__delete(evsel); + { + struct evsel *evsel, *n; + + __evlist__for_each_entry_safe(&head, n, evsel) + evsel__delete(evsel); + } return -1; } diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 0ed177991ad0..26abc159fc0e 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -10,7 +10,6 @@ #include "../../../util/debug.h" #include "../../../util/event.h" #include "../../../util/pmu.h" -#include "../../../util/pmu-hybrid.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG(AX, PERF_REG_X86_AX), @@ -286,7 +285,6 @@ uint64_t arch__intr_reg_mask(void) .disabled = 1, .exclude_kernel = 1, }; - struct perf_pmu *pmu; int fd; /* * In an unnamed union, init it here to build on older gcc versions @@ -294,12 +292,20 @@ uint64_t arch__intr_reg_mask(void) attr.sample_period = 1; if (perf_pmu__has_hybrid()) { + struct perf_pmu *pmu = NULL; + __u64 type = PERF_TYPE_RAW; + /* * The same register set is supported among different hybrid PMUs. * Only check the first available one. */ - pmu = list_first_entry(&perf_pmu__hybrid_pmus, typeof(*pmu), hybrid_list); - attr.config |= (__u64)pmu->type << PERF_PMU_TYPE_SHIFT; + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->is_core) { + type = pmu->type; + break; + } + } + attr.config |= type << PERF_PMU_TYPE_SHIFT; } event_attr_init(&attr); -- cgit v1.2.3 From 1215795cebb24578afd378b23d206014327558c4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:54 -0700 Subject: perf topology: Avoid hybrid list for hybrid topology Avoid perf_pmu__for_each_hybrid_pmu in hybrid_topology__new by scanning all PMUs and processing the is_core ones. Add early exit for non-hybrid. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cputopo.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index ca1d833a0c26..a5c259bd5cc0 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -12,7 +12,7 @@ #include "cpumap.h" #include "debug.h" #include "env.h" -#include "pmu-hybrid.h" +#include "pmu.h" #define PACKAGE_CPUS_FMT \ "%s/devices/system/cpu/cpu%d/topology/package_cpus_list" @@ -469,11 +469,17 @@ err: struct hybrid_topology *hybrid_topology__new(void) { - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; struct hybrid_topology *tp = NULL; - u32 nr, i = 0; + u32 nr = 0, i = 0; - nr = perf_pmu__hybrid_pmu_num(); + if (!perf_pmu__has_hybrid()) + return NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->is_core) + nr++; + } if (nr == 0) return NULL; @@ -482,7 +488,10 @@ struct hybrid_topology *hybrid_topology__new(void) return NULL; tp->nr = nr; - perf_pmu__for_each_hybrid_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (!pmu->is_core) + continue; + if (load_hybrid_node(&tp->nodes[i], pmu)) { hybrid_topology__delete(tp); return NULL; -- cgit v1.2.3 From 5d9fb6667642ce1e382afd37184ec6acf1bb7626 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:55 -0700 Subject: perf evsel: Compute is_hybrid from PMU being core Short-cut when has_hybrid is false, otherwise return if the evsel's PMU is core. Add a comment for the some what surprising no PMU cases of hardware and legacy cache events. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 12 ++++++++++-- tools/perf/util/python.c | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1df8f967d2eb..1c6e22e3f345 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -46,8 +46,8 @@ #include "memswap.h" #include "util.h" #include "util/hashmap.h" -#include "pmu-hybrid.h" #include "off_cpu.h" +#include "pmu.h" #include "../perf-sys.h" #include "util/parse-branch-options.h" #include "util/bpf-filter.h" @@ -3132,9 +3132,17 @@ void evsel__zero_per_pkg(struct evsel *evsel) } } +/** + * evsel__is_hybrid - does the evsel have a known PMU that is hybrid. Note, this + * will be false on hybrid systems for hardware and legacy + * cache events. + */ bool evsel__is_hybrid(const struct evsel *evsel) { - return evsel->pmu_name && perf_pmu__is_hybrid(evsel->pmu_name); + if (!perf_pmu__has_hybrid()) + return false; + + return evsel->core.is_pmu_core; } struct evsel *evsel__leader(const struct evsel *evsel) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3c1f4c979c9e..b27b27086422 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -102,6 +102,11 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, return EOF; } +bool perf_pmu__has_hybrid(void) +{ + return false; +} + bool evsel__is_aux_event(const struct evsel *evsel __maybe_unused) { return false; -- cgit v1.2.3 From 178ddf3bad981380ad284ba1d70013cf1fdef981 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:56 -0700 Subject: perf header: Avoid hybrid PMU list in write_pmu_caps Avoid perf_pmu__for_each_hybrid_pmu by iterating all PMUs are dumping the core ones. This will eventually allow removal of the hybrid PMU list. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 560871736764..37fa66b1ca77 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -51,7 +51,6 @@ #include "bpf-event.h" #include "bpf-utils.h" #include "clockid.h" -#include "pmu-hybrid.h" #include #include @@ -1605,17 +1604,21 @@ static int write_pmu_caps(struct feat_fd *ff, * Write hybrid pmu caps first to maintain compatibility with * older perf tool. */ - pmu = NULL; - perf_pmu__for_each_hybrid_pmu(pmu) { - ret = __write_pmu_caps(ff, pmu, true); - if (ret < 0) - return ret; + if (perf_pmu__has_hybrid()) { + pmu = NULL; + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->is_core) + continue; + + ret = __write_pmu_caps(ff, pmu, true); + if (ret < 0) + return ret; + } } pmu = NULL; while ((pmu = perf_pmu__scan(pmu))) { - if (!pmu->name || !strcmp(pmu->name, "cpu") || - !pmu->nr_caps || perf_pmu__is_hybrid(pmu->name)) + if (pmu->is_core || !pmu->nr_caps) continue; ret = __write_pmu_caps(ff, pmu, true); -- cgit v1.2.3 From ec6a4a8bd3a554674eaa4ac3f423e1a5347427ee Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:57 -0700 Subject: perf metrics: Remove perf_pmu__is_hybrid use Switch from perf_pmu__is_hybrid to avoid implicitly using the hybrid PMU list. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 80ffd6da70c7..3f04a686d1cd 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -11,7 +11,6 @@ #include "evsel.h" #include "strbuf.h" #include "pmu.h" -#include "pmu-hybrid.h" #include "print-events.h" #include "smt.h" #include "expr.h" @@ -274,7 +273,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, const char *metric_id; struct evsel *ev; size_t ids_size, matched_events, i; - bool all_pmus = !strcmp(pmu, "all") || !perf_pmu__is_hybrid(pmu); + bool all_pmus = !strcmp(pmu, "all") || !perf_pmu__has_hybrid() || !is_pmu_hybrid(pmu); *out_metric_events = NULL; ids_size = hashmap__size(ids); @@ -288,8 +287,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, struct expr_id_data *val_ptr; /* Don't match events for the wrong hybrid PMU. */ - if (!all_pmus && ev->pmu_name && - perf_pmu__is_hybrid(ev->pmu_name) && + if (!all_pmus && ev->pmu_name && evsel__is_hybrid(ev) && strcmp(ev->pmu_name, pmu)) continue; /* -- cgit v1.2.3 From 3d88055f081056be6448a2628ad815d88d7ed570 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:58 -0700 Subject: perf stat: Avoid hybrid PMU list perf_pmu__is_hybrid implicitly uses the hybrid PMU list. Instead return false if hybrid isn't present, if it is then see if any evsel's PMUs are core. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 4cce7d3c5e52..a3e184e0b5ba 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -19,7 +19,7 @@ #include #include "util.h" #include "iostat.h" -#include "pmu-hybrid.h" +#include "pmu.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -695,11 +695,12 @@ static bool evlist__has_hybrid(struct evlist *evlist) { struct evsel *evsel; + if (!perf_pmu__has_hybrid()) + return false; + evlist__for_each_entry(evlist, evsel) { - if (evsel->pmu_name && - perf_pmu__is_hybrid(evsel->pmu_name)) { + if (evsel->core.is_pmu_core) return true; - } } return false; -- cgit v1.2.3 From abe9544ea78a2e0c3cc92b4410a57a9c0732293f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:59 -0700 Subject: perf mem: Avoid hybrid PMU list Add perf_pmu__num_mem_pmus that scans/counts the number of PMUs for mem events. Switch perf_pmu__for_each_hybrid_pmu to iterating all PMUs and only handling is_core ones. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 7 ++----- tools/perf/builtin-mem.c | 7 ++----- tools/perf/util/mem-events.c | 20 ++++++++++++++------ tools/perf/util/pmu.c | 17 +++++++++++++++++ tools/perf/util/pmu.h | 1 + 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 08455e26b606..2757ccc19c5e 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -42,7 +42,6 @@ #include "ui/ui.h" #include "ui/progress.h" #include "pmu.h" -#include "pmu-hybrid.h" #include "string2.h" #include "util/util.h" @@ -3259,10 +3258,8 @@ static int perf_c2c__record(int argc, const char **argv) argc = parse_options(argc, argv, options, record_mem_usage, PARSE_OPT_KEEP_UNKNOWN); - if (!perf_pmu__has_hybrid()) - rec_argc = argc + 11; /* max number of arguments */ - else - rec_argc = argc + 11 * perf_pmu__hybrid_pmu_num(); + /* Max number of arguments multiplied by number of PMUs that can support them. */ + rec_argc = argc + 11 * perf_pmu__num_mem_pmus(); rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 65465930ef8e..f4f1ff76d49d 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -18,7 +18,6 @@ #include "util/map.h" #include "util/symbol.h" #include "util/pmu.h" -#include "util/pmu-hybrid.h" #include "util/sample.h" #include "util/string2.h" #include "util/util.h" @@ -93,10 +92,8 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) argc = parse_options(argc, argv, options, record_mem_usage, PARSE_OPT_KEEP_UNKNOWN); - if (!perf_pmu__has_hybrid()) - rec_argc = argc + 9; /* max number of arguments */ - else - rec_argc = argc + 9 * perf_pmu__hybrid_pmu_num(); + /* Max number of arguments multiplied by number of PMUs that can support them. */ + rec_argc = argc + 9 * perf_pmu__num_mem_pmus(); if (mem->cpu_list) rec_argc += 2; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index ed1ee4b05356..c9e422a38258 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -13,7 +13,6 @@ #include "debug.h" #include "symbol.h" #include "pmu.h" -#include "pmu-hybrid.h" unsigned int perf_mem_events__loads_ldlat = 30; @@ -120,7 +119,6 @@ int perf_mem_events__init(void) for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { struct perf_mem_event *e = perf_mem_events__ptr(j); - struct perf_pmu *pmu; char sysfs_name[100]; /* @@ -135,7 +133,12 @@ int perf_mem_events__init(void) e->sysfs_name, "cpu"); e->supported = perf_mem_event__supported(mnt, sysfs_name); } else { - perf_pmu__for_each_hybrid_pmu(pmu) { + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (!pmu->is_core) + continue; + scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); e->supported |= perf_mem_event__supported(mnt, sysfs_name); @@ -170,9 +173,12 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, { const char *mnt = sysfs__mount(); char sysfs_name[100]; - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (!pmu->is_core) + continue; - perf_pmu__for_each_hybrid_pmu(pmu) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); if (!perf_mem_event__supported(mnt, sysfs_name)) { @@ -210,7 +216,9 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, return -1; } - perf_pmu__for_each_hybrid_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (!pmu->is_core) + continue; rec_argv[i++] = "-e"; s = perf_mem_events__name(j, pmu->name); if (s) { diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 5a7bfbf621d0..65daa0cc71d6 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1660,6 +1660,23 @@ bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) return !is_pmu_hybrid(pmu->name); } +static bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu) +{ + return pmu->is_core; +} + +int perf_pmu__num_mem_pmus(void) +{ + struct perf_pmu *pmu = NULL; + int count = 0; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (perf_pmu__is_mem_pmu(pmu)) + count++; + } + return count; +} + static bool pmu_alias_is_duplicate(struct sevent *alias_a, struct sevent *alias_b) { diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index af10d137e2b5..5f5de7c20ab6 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -231,6 +231,7 @@ bool is_pmu_hybrid(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); +int perf_pmu__num_mem_pmus(void); void print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool pmu_have_event(const char *pname, const char *name); -- cgit v1.2.3 From 597a4276fb326163b90754ef7b2a550a6b2b4054 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:00 -0700 Subject: perf pmu: Remove perf_pmu__hybrid_pmus list Rather than iterate hybrid PMUs, inhererently Intel specific, iterate all PMUs checking whether they are core. To only get hybrid cores, first call perf_pmu__has_hybrid. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-25-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 1 - tools/perf/util/Build | 1 - tools/perf/util/pmu-hybrid.c | 39 -------------------------------------- tools/perf/util/pmu-hybrid.h | 29 ---------------------------- tools/perf/util/pmu.c | 7 ------- tools/perf/util/pmu.h | 2 -- tools/perf/util/print-events.c | 1 - tools/perf/util/python-ext-sources | 1 - 8 files changed, 81 deletions(-) delete mode 100644 tools/perf/util/pmu-hybrid.c delete mode 100644 tools/perf/util/pmu-hybrid.h diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e30e8d6a6575..2abcad2998f6 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -49,7 +49,6 @@ #include "util/util.h" #include "util/pfm.h" #include "util/clockid.h" -#include "util/pmu-hybrid.h" #include "util/off_cpu.h" #include "util/bpf-filter.h" #include "asm/bug.h" diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 21e4cdcba504..0d68be51a739 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -73,7 +73,6 @@ perf-y += pmu.o perf-y += pmus.o perf-y += pmu-flex.o perf-y += pmu-bison.o -perf-y += pmu-hybrid.o perf-y += svghelper.o perf-$(CONFIG_LIBTRACEEVENT) += trace-event-info.o perf-y += trace-event-scripting.o diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c deleted file mode 100644 index 7fe943dd3217..000000000000 --- a/tools/perf/util/pmu-hybrid.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "fncache.h" -#include "pmu-hybrid.h" - -LIST_HEAD(perf_pmu__hybrid_pmus); - -static struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name) -{ - struct perf_pmu *pmu; - - if (!name) - return NULL; - - perf_pmu__for_each_hybrid_pmu(pmu) { - if (!strcmp(name, pmu->name)) - return pmu; - } - - return NULL; -} - -bool perf_pmu__is_hybrid(const char *name) -{ - return perf_pmu__find_hybrid_pmu(name) != NULL; -} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h deleted file mode 100644 index 8dbcae935020..000000000000 --- a/tools/perf/util/pmu-hybrid.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PMU_HYBRID_H -#define __PMU_HYBRID_H - -#include -#include -#include -#include -#include "pmu.h" - -extern struct list_head perf_pmu__hybrid_pmus; - -#define perf_pmu__for_each_hybrid_pmu(pmu) \ - list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list) - -bool perf_pmu__is_hybrid(const char *name); - -static inline int perf_pmu__hybrid_pmu_num(void) -{ - struct perf_pmu *pmu; - int num = 0; - - perf_pmu__for_each_hybrid_pmu(pmu) - num++; - - return num; -} - -#endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 65daa0cc71d6..21ee23b78f5a 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -32,7 +32,6 @@ #include "string2.h" #include "strbuf.h" #include "fncache.h" -#include "pmu-hybrid.h" #include "util/evsel_config.h" struct perf_pmu perf_pmu__fake; @@ -954,11 +953,6 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); - if (!strcmp(name, "cpu_core") || !strcmp(name, "cpu_atom")) - list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus); - else - INIT_LIST_HEAD(&pmu->hybrid_list); - pmu->default_config = perf_pmu__get_default_config(pmu); return pmu; @@ -2131,7 +2125,6 @@ void perf_pmu__destroy(void) list_for_each_entry_safe(pmu, tmp, &pmus, list) { list_del(&pmu->list); - list_del(&pmu->hybrid_list); perf_pmu__delete(pmu); } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 5f5de7c20ab6..cb51ad6e40fa 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -119,8 +119,6 @@ struct perf_pmu { struct list_head caps; /** @list: Element on pmus list in pmu.c. */ struct list_head list; - /** @hybrid_list: Element on perf_pmu__hybrid_pmus. */ - struct list_head hybrid_list; /** * @missing_features: Features to inhibit when events on this PMU are diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 69492cbd6921..8d823bc906e6 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -26,7 +26,6 @@ #include "strlist.h" #include "tracepoint.h" #include "pfm.h" -#include "pmu-hybrid.h" #include "thread_map.h" #define MAX_NAME_LEN 100 diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index aa5156c2bcff..d4c9b4cd35ef 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -39,5 +39,4 @@ util/affinity.c util/rwsem.c util/hashmap.c util/perf_regs.c -util/pmu-hybrid.c util/fncache.c -- cgit v1.2.3 From f24ebe8053514936d4e8cffb707af3a275fa32e5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:01 -0700 Subject: perf pmus: Prefer perf_pmu__scan over perf_pmus__for_each_pmu perf_pmus__for_each_pmu doesn't lazily initialize pmus making its use error prone. Just use perf_pmu__scan as this only impacts non-performance critical tests. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-26-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/pmu-scan.c | 6 ++---- tools/perf/tests/event_groups.c | 7 ++----- tools/perf/tests/parse-events.c | 11 ++++------- tools/perf/util/pmus.h | 2 -- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c index f0f007843bb8..f4a6c37cbe27 100644 --- a/tools/perf/bench/pmu-scan.c +++ b/tools/perf/bench/pmu-scan.c @@ -40,13 +40,11 @@ static struct pmu_scan_result *results; static int save_result(void) { - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; struct list_head *list; struct pmu_scan_result *r; - perf_pmu__scan(NULL); - - perf_pmus__for_each_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { r = realloc(results, (nr_pmus + 1) * sizeof(*r)); if (r == NULL) return -ENOMEM; diff --git a/tools/perf/tests/event_groups.c b/tools/perf/tests/event_groups.c index 029442b4e9c6..3d9a2b524bba 100644 --- a/tools/perf/tests/event_groups.c +++ b/tools/perf/tests/event_groups.c @@ -50,13 +50,10 @@ static int event_open(int type, unsigned long config, int group_fd) static int setup_uncore_event(void) { - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; int i, fd; - if (list_empty(&pmus)) - perf_pmu__scan(NULL); - - perf_pmus__for_each_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { for (i = 0; i < NR_UNCORE_PMUS; i++) { if (!strcmp(uncore_pmus[i].name, pmu->name)) { pr_debug("Using %s for uncore pmu event\n", pmu->name); diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 72a10bed84fd..277607ede060 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -108,11 +108,11 @@ static int test__checkevent_raw(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); perf_evlist__for_each_evsel(&evlist->core, evsel) { - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; bool type_matched = false; TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, 0x1a)); - perf_pmus__for_each_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { if (pmu->type == evsel->attr.type) { TEST_ASSERT_VAL("PMU type expected once", !type_matched); type_matched = true; @@ -2243,13 +2243,10 @@ static int test__terms2(struct test_suite *test __maybe_unused, int subtest __ma static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct perf_pmu *pmu; + struct perf_pmu *pmu = NULL; int ret = TEST_OK; - if (list_empty(&pmus)) - perf_pmu__scan(NULL); - - perf_pmus__for_each_pmu(pmu) { + while ((pmu = perf_pmu__scan(pmu)) != NULL) { struct stat st; char path[PATH_MAX]; struct dirent *ent; diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index d475e2960c10..257de10788e8 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -5,8 +5,6 @@ extern struct list_head pmus; struct perf_pmu; -#define perf_pmus__for_each_pmu(pmu) list_for_each_entry(pmu, &pmus, list) - const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); #endif /* __PMUS_H */ -- cgit v1.2.3 From 875375ea91d8044baddcb62d8333b58f687de444 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:02 -0700 Subject: perf x86 mem: minor refactor to is_mem_loads_aux_event Find the PMU and then the event off of it. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-27-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/mem-events.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c index f683ac702247..02d65e446f46 100644 --- a/tools/perf/arch/x86/util/mem-events.c +++ b/tools/perf/arch/x86/util/mem-events.c @@ -55,13 +55,13 @@ struct perf_mem_event *perf_mem_events__ptr(int i) bool is_mem_loads_aux_event(struct evsel *leader) { - if (perf_pmu__find("cpu")) { - if (!pmu_have_event("cpu", "mem-loads-aux")) - return false; - } else if (perf_pmu__find("cpu_core")) { - if (!pmu_have_event("cpu_core", "mem-loads-aux")) - return false; - } + struct perf_pmu *pmu = perf_pmu__find("cpu"); + + if (!pmu) + pmu = perf_pmu__find("cpu_core"); + + if (pmu && !pmu_have_event(pmu->name, "mem-loads-aux")) + return false; return leader->core.attr.config == MEM_LOADS_AUX; } -- cgit v1.2.3 From 1eaf496ed386934f1c2439a120fe84a05194f91a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:03 -0700 Subject: perf pmu: Separate pmu and pmus Separate and hide the pmus list in pmus.[ch]. Move pmus functionality out of pmu.[ch] into pmus.[ch] renaming pmus functions which were prefixed perf_pmu__ to perf_pmus__. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-28-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/auxtrace.c | 7 +- tools/perf/arch/arm/util/cs-etm.c | 4 +- tools/perf/arch/arm64/util/pmu.c | 3 +- tools/perf/arch/x86/tests/hybrid.c | 5 +- tools/perf/arch/x86/util/auxtrace.c | 5 +- tools/perf/arch/x86/util/evlist.c | 5 +- tools/perf/arch/x86/util/evsel.c | 7 +- tools/perf/arch/x86/util/intel-bts.c | 4 +- tools/perf/arch/x86/util/intel-pt.c | 4 +- tools/perf/arch/x86/util/mem-events.c | 9 +- tools/perf/arch/x86/util/perf_regs.c | 5 +- tools/perf/arch/x86/util/topdown.c | 5 +- tools/perf/bench/pmu-scan.c | 10 +- tools/perf/builtin-c2c.c | 4 +- tools/perf/builtin-list.c | 4 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-record.c | 6 +- tools/perf/builtin-stat.c | 4 +- tools/perf/tests/attr.c | 4 +- tools/perf/tests/event_groups.c | 2 +- tools/perf/tests/parse-events.c | 8 +- tools/perf/tests/parse-metric.c | 4 +- tools/perf/tests/pmu-events.c | 3 +- tools/perf/tests/switch-tracking.c | 4 +- tools/perf/tests/topology.c | 4 +- tools/perf/util/cputopo.c | 7 +- tools/perf/util/env.c | 5 +- tools/perf/util/evsel.c | 3 +- tools/perf/util/header.c | 15 +- tools/perf/util/mem-events.c | 11 +- tools/perf/util/metricgroup.c | 5 +- tools/perf/util/parse-events.c | 15 +- tools/perf/util/parse-events.y | 3 +- tools/perf/util/pfm.c | 6 +- tools/perf/util/pmu.c | 411 +--------------------------------- tools/perf/util/pmu.h | 13 +- tools/perf/util/pmus.c | 396 +++++++++++++++++++++++++++++++- tools/perf/util/pmus.h | 14 +- tools/perf/util/print-events.c | 5 +- tools/perf/util/python.c | 3 +- tools/perf/util/stat-display.c | 3 +- 41 files changed, 533 insertions(+), 506 deletions(-) diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index adec6c9ee11d..3b8eca0ffb17 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -14,6 +14,7 @@ #include "../../../util/debug.h" #include "../../../util/evlist.h" #include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include "cs-etm.h" #include "arm-spe.h" #include "hisi-ptt.h" @@ -40,7 +41,7 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) return NULL; } - arm_spe_pmus[*nr_spes] = perf_pmu__find(arm_spe_pmu_name); + arm_spe_pmus[*nr_spes] = perf_pmus__find(arm_spe_pmu_name); if (arm_spe_pmus[*nr_spes]) { pr_debug2("%s %d: arm_spe_pmu %d type %d name %s\n", __func__, __LINE__, *nr_spes, @@ -87,7 +88,7 @@ static struct perf_pmu **find_all_hisi_ptt_pmus(int *nr_ptts, int *err) rewinddir(dir); while ((dent = readdir(dir))) { if (strstr(dent->d_name, HISI_PTT_PMU_NAME) && idx < *nr_ptts) { - hisi_ptt_pmus[idx] = perf_pmu__find(dent->d_name); + hisi_ptt_pmus[idx] = perf_pmus__find(dent->d_name); if (hisi_ptt_pmus[idx]) idx++; } @@ -131,7 +132,7 @@ struct auxtrace_record if (!evlist) return NULL; - cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME); + cs_etm_pmu = perf_pmus__find(CORESIGHT_ETM_PMU_NAME); arm_spe_pmus = find_all_arm_spe_pmus(&nr_spes, err); hisi_ptt_pmus = find_all_hisi_ptt_pmus(&nr_ptts, err); diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 9ca040bfb1aa..7c51fa182b51 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -25,7 +25,7 @@ #include "../../../util/evsel.h" #include "../../../util/perf_api_probe.h" #include "../../../util/evsel_config.h" -#include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include "../../../util/cs-etm.h" #include // page_size #include "../../../util/session.h" @@ -881,7 +881,7 @@ struct auxtrace_record *cs_etm_record_init(int *err) struct perf_pmu *cs_etm_pmu; struct cs_etm_recording *ptr; - cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME); + cs_etm_pmu = perf_pmus__find(CORESIGHT_ETM_PMU_NAME); if (!cs_etm_pmu) { *err = -EINVAL; diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c index ef1ed645097c..2504d43a39a7 100644 --- a/tools/perf/arch/arm64/util/pmu.c +++ b/tools/perf/arch/arm64/util/pmu.c @@ -3,6 +3,7 @@ #include #include "../../../util/cpumap.h" #include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include #include @@ -10,7 +11,7 @@ static struct perf_pmu *pmu__find_core_pmu(void) { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!is_pmu_core(pmu->name)) continue; diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index 944bd1b4bab6..e466735d68d5 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -4,6 +4,7 @@ #include "evlist.h" #include "evsel.h" #include "pmu.h" +#include "pmus.h" #include "tests/tests.h" static bool test_config(const struct evsel *evsel, __u64 expected_config) @@ -113,7 +114,7 @@ static int test__hybrid_raw1(struct evlist *evlist) struct perf_evsel *evsel; perf_evlist__for_each_evsel(&evlist->core, evsel) { - struct perf_pmu *pmu = perf_pmu__find_by_type(evsel->attr.type); + struct perf_pmu *pmu = perf_pmus__find_by_type(evsel->attr.type); TEST_ASSERT_VAL("missing pmu", pmu); TEST_ASSERT_VAL("unexpected pmu", !strncmp(pmu->name, "cpu_", 4)); @@ -280,7 +281,7 @@ static int test_events(const struct evlist_test *events, int cnt) int test__hybrid(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return TEST_SKIP; return test_events(test__hybrid_events, ARRAY_SIZE(test__hybrid_events)); diff --git a/tools/perf/arch/x86/util/auxtrace.c b/tools/perf/arch/x86/util/auxtrace.c index 330d03216b0e..354780ff1605 100644 --- a/tools/perf/arch/x86/util/auxtrace.c +++ b/tools/perf/arch/x86/util/auxtrace.c @@ -10,6 +10,7 @@ #include "../../../util/header.h" #include "../../../util/debug.h" #include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include "../../../util/auxtrace.h" #include "../../../util/intel-pt.h" #include "../../../util/intel-bts.h" @@ -25,8 +26,8 @@ struct auxtrace_record *auxtrace_record__init_intel(struct evlist *evlist, bool found_pt = false; bool found_bts = false; - intel_pt_pmu = perf_pmu__find(INTEL_PT_PMU_NAME); - intel_bts_pmu = perf_pmu__find(INTEL_BTS_PMU_NAME); + intel_pt_pmu = perf_pmus__find(INTEL_PT_PMU_NAME); + intel_bts_pmu = perf_pmus__find(INTEL_BTS_PMU_NAME); evlist__for_each_entry(evlist, evsel) { if (intel_pt_pmu && evsel->core.attr.type == intel_pt_pmu->type) diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index 03f7eb4cf0a4..03240c640c7f 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "util/pmu.h" +#include "util/pmus.h" #include "util/evlist.h" #include "util/parse-events.h" #include "util/event.h" @@ -17,7 +18,7 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, for (i = 0; i < nr_attrs; i++) event_attr_init(attrs + i); - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return evlist__add_attrs(evlist, attrs, nr_attrs); for (i = 0; i < nr_attrs; i++) { @@ -32,7 +33,7 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, continue; } - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { struct perf_cpu_map *cpus; struct evsel *evsel; diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index 153cdca94cd4..25da46c8cca9 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -4,6 +4,7 @@ #include "util/evsel.h" #include "util/env.h" #include "util/pmu.h" +#include "util/pmus.h" #include "linux/string.h" #include "evsel.h" #include "util/debug.h" @@ -30,7 +31,7 @@ bool evsel__sys_has_perf_metrics(const struct evsel *evsel) * should be good enough to detect the perf metrics feature. */ if ((evsel->core.attr.type == PERF_TYPE_RAW) && - pmu_have_event(pmu_name, "slots")) + perf_pmus__have_event(pmu_name, "slots")) return true; return false; @@ -98,8 +99,8 @@ void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr) if (!evsel_pmu) return; - ibs_fetch_pmu = perf_pmu__find("ibs_fetch"); - ibs_op_pmu = perf_pmu__find("ibs_op"); + ibs_fetch_pmu = perf_pmus__find("ibs_fetch"); + ibs_op_pmu = perf_pmus__find("ibs_op"); if (ibs_fetch_pmu && ibs_fetch_pmu->type == evsel_pmu->type) { if (attr->config & IBS_FETCH_L3MISSONLY) { diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 439c2956f3e7..d2c8cac11470 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -17,7 +17,7 @@ #include "../../../util/evlist.h" #include "../../../util/mmap.h" #include "../../../util/session.h" -#include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include "../../../util/debug.h" #include "../../../util/record.h" #include "../../../util/tsc.h" @@ -416,7 +416,7 @@ out_err: struct auxtrace_record *intel_bts_recording_init(int *err) { - struct perf_pmu *intel_bts_pmu = perf_pmu__find(INTEL_BTS_PMU_NAME); + struct perf_pmu *intel_bts_pmu = perf_pmus__find(INTEL_BTS_PMU_NAME); struct intel_bts_recording *btsr; if (!intel_bts_pmu) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 17336da08b58..74b70fd379df 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -23,7 +23,7 @@ #include "../../../util/mmap.h" #include #include "../../../util/parse-events.h" -#include "../../../util/pmu.h" +#include "../../../util/pmus.h" #include "../../../util/debug.h" #include "../../../util/auxtrace.h" #include "../../../util/perf_api_probe.h" @@ -1185,7 +1185,7 @@ static u64 intel_pt_reference(struct auxtrace_record *itr __maybe_unused) struct auxtrace_record *intel_pt_recording_init(int *err) { - struct perf_pmu *intel_pt_pmu = perf_pmu__find(INTEL_PT_PMU_NAME); + struct perf_pmu *intel_pt_pmu = perf_pmus__find(INTEL_PT_PMU_NAME); struct intel_pt_recording *ptr; if (!intel_pt_pmu) diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c index 02d65e446f46..32879d12a8d5 100644 --- a/tools/perf/arch/x86/util/mem-events.c +++ b/tools/perf/arch/x86/util/mem-events.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "util/pmu.h" +#include "util/pmus.h" #include "util/env.h" #include "map_symbol.h" #include "mem-events.h" @@ -55,12 +56,12 @@ struct perf_mem_event *perf_mem_events__ptr(int i) bool is_mem_loads_aux_event(struct evsel *leader) { - struct perf_pmu *pmu = perf_pmu__find("cpu"); + struct perf_pmu *pmu = perf_pmus__find("cpu"); if (!pmu) - pmu = perf_pmu__find("cpu_core"); + pmu = perf_pmus__find("cpu_core"); - if (pmu && !pmu_have_event(pmu->name, "mem-loads-aux")) + if (pmu && !perf_pmu__have_event(pmu, "mem-loads-aux")) return false; return leader->core.attr.config == MEM_LOADS_AUX; @@ -82,7 +83,7 @@ char *perf_mem_events__name(int i, char *pmu_name) pmu_name = (char *)"cpu"; } - if (pmu_have_event(pmu_name, "mem-loads-aux")) { + if (perf_pmus__have_event(pmu_name, "mem-loads-aux")) { scnprintf(mem_loads_name, sizeof(mem_loads_name), MEM_LOADS_AUX_NAME, pmu_name, pmu_name, perf_mem_events__loads_ldlat); diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 26abc159fc0e..befa7f3659b9 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -10,6 +10,7 @@ #include "../../../util/debug.h" #include "../../../util/event.h" #include "../../../util/pmu.h" +#include "../../../util/pmus.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG(AX, PERF_REG_X86_AX), @@ -291,7 +292,7 @@ uint64_t arch__intr_reg_mask(void) */ attr.sample_period = 1; - if (perf_pmu__has_hybrid()) { + if (perf_pmus__has_hybrid()) { struct perf_pmu *pmu = NULL; __u64 type = PERF_TYPE_RAW; @@ -299,7 +300,7 @@ uint64_t arch__intr_reg_mask(void) * The same register set is supported among different hybrid PMUs. * Only check the first available one. */ - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (pmu->is_core) { type = pmu->type; break; diff --git a/tools/perf/arch/x86/util/topdown.c b/tools/perf/arch/x86/util/topdown.c index 9ad5e5c7bd27..3f9a267d4501 100644 --- a/tools/perf/arch/x86/util/topdown.c +++ b/tools/perf/arch/x86/util/topdown.c @@ -2,6 +2,7 @@ #include "api/fs/fs.h" #include "util/evsel.h" #include "util/pmu.h" +#include "util/pmus.h" #include "util/topdown.h" #include "topdown.h" #include "evsel.h" @@ -22,8 +23,8 @@ bool topdown_sys_has_perf_metrics(void) * The slots event is only available when the core PMU * supports the perf metrics feature. */ - pmu = perf_pmu__find_by_type(PERF_TYPE_RAW); - if (pmu && pmu_have_event(pmu->name, "slots")) + pmu = perf_pmus__find_by_type(PERF_TYPE_RAW); + if (pmu && perf_pmu__have_event(pmu, "slots")) has_perf_metrics = true; cached = true; diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c index f4a6c37cbe27..51cae2d03353 100644 --- a/tools/perf/bench/pmu-scan.c +++ b/tools/perf/bench/pmu-scan.c @@ -44,7 +44,7 @@ static int save_result(void) struct list_head *list; struct pmu_scan_result *r; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { r = realloc(results, (nr_pmus + 1) * sizeof(*r)); if (r == NULL) return -ENOMEM; @@ -68,7 +68,7 @@ static int save_result(void) nr_pmus++; } - perf_pmu__destroy(); + perf_pmus__destroy(); return 0; } @@ -81,7 +81,7 @@ static int check_result(void) for (int i = 0; i < nr_pmus; i++) { r = &results[i]; - pmu = perf_pmu__find(r->name); + pmu = perf_pmus__find(r->name); if (pmu == NULL) { pr_err("Cannot find PMU %s\n", r->name); return -1; @@ -144,7 +144,7 @@ static int run_pmu_scan(void) for (i = 0; i < iterations; i++) { gettimeofday(&start, NULL); - perf_pmu__scan(NULL); + perf_pmus__scan(NULL); gettimeofday(&end, NULL); timersub(&end, &start, &diff); @@ -152,7 +152,7 @@ static int run_pmu_scan(void) update_stats(&stats, runtime_us); ret = check_result(); - perf_pmu__destroy(); + perf_pmus__destroy(); if (ret < 0) break; } diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 2757ccc19c5e..05dfd98af170 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -41,7 +41,7 @@ #include "symbol.h" #include "ui/ui.h" #include "ui/progress.h" -#include "pmu.h" +#include "pmus.h" #include "string2.h" #include "util/util.h" @@ -3259,7 +3259,7 @@ static int perf_c2c__record(int argc, const char **argv) PARSE_OPT_KEEP_UNKNOWN); /* Max number of arguments multiplied by number of PMUs that can support them. */ - rec_argc = argc + 11 * perf_pmu__num_mem_pmus(); + rec_argc = argc + 11 * perf_pmus__num_mem_pmus(); rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index e8520a027b45..03b5d26b2489 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -527,7 +527,7 @@ int cmd_list(int argc, const char **argv) strcmp(argv[i], "hwcache") == 0) print_hwcache_events(&print_cb, ps); else if (strcmp(argv[i], "pmu") == 0) - print_pmu_events(&print_cb, ps); + perf_pmus__print_pmu_events(&print_cb, ps); else if (strcmp(argv[i], "sdt") == 0) print_sdt_events(&print_cb, ps); else if (strcmp(argv[i], "metric") == 0 || strcmp(argv[i], "metrics") == 0) { @@ -567,7 +567,7 @@ int cmd_list(int argc, const char **argv) event_symbols_sw, PERF_COUNT_SW_MAX); print_tool_events(&print_cb, ps); print_hwcache_events(&print_cb, ps); - print_pmu_events(&print_cb, ps); + perf_pmus__print_pmu_events(&print_cb, ps); print_tracepoint_events(&print_cb, ps); print_sdt_events(&print_cb, ps); default_ps.metrics = true; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index f4f1ff76d49d..960bfd4b732a 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -17,7 +17,7 @@ #include "util/dso.h" #include "util/map.h" #include "util/symbol.h" -#include "util/pmu.h" +#include "util/pmus.h" #include "util/sample.h" #include "util/string2.h" #include "util/util.h" @@ -93,7 +93,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) PARSE_OPT_KEEP_UNKNOWN); /* Max number of arguments multiplied by number of PMUs that can support them. */ - rec_argc = argc + 9 * perf_pmu__num_mem_pmus(); + rec_argc = argc + 9 * perf_pmus__num_mem_pmus(); if (mem->cpu_list) rec_argc += 2; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2abcad2998f6..4b9212f75493 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -48,6 +48,8 @@ #include "util/bpf-event.h" #include "util/util.h" #include "util/pfm.h" +#include "util/pmu.h" +#include "util/pmus.h" #include "util/clockid.h" #include "util/off_cpu.h" #include "util/bpf-filter.h" @@ -1292,7 +1294,7 @@ static int record__open(struct record *rec) * of waiting or event synthesis. */ if (opts->target.initial_delay || target__has_cpu(&opts->target) || - perf_pmu__has_hybrid()) { + perf_pmus__has_hybrid()) { pos = evlist__get_tracking_event(evlist); if (!evsel__is_dummy_event(pos)) { /* Set up dummy event. */ @@ -2191,7 +2193,7 @@ static void record__uniquify_name(struct record *rec) char *new_name; int ret; - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return; evlist__for_each_entry(evlist, pos) { diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 62bbeea93bf3..c87c6897edc9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2140,11 +2140,11 @@ static int add_default_attributes(void) if (evlist__add_default_attrs(evsel_list, default_attrs0) < 0) return -1; - if (pmu_have_event("cpu", "stalled-cycles-frontend")) { + if (perf_pmus__have_event("cpu", "stalled-cycles-frontend")) { if (evlist__add_default_attrs(evsel_list, frontend_attrs) < 0) return -1; } - if (pmu_have_event("cpu", "stalled-cycles-backend")) { + if (perf_pmus__have_event("cpu", "stalled-cycles-backend")) { if (evlist__add_default_attrs(evsel_list, backend_attrs) < 0) return -1; } diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 56fba08a3037..674876e6c8e6 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -34,7 +34,7 @@ #include "event.h" #include "util.h" #include "tests.h" -#include "pmu.h" +#include "pmus.h" #define ENV "PERF_TEST_ATTR" @@ -185,7 +185,7 @@ static int test__attr(struct test_suite *test __maybe_unused, int subtest __mayb char path_dir[PATH_MAX]; char *exec_path; - if (perf_pmu__has_hybrid()) + if (perf_pmus__has_hybrid()) return TEST_SKIP; /* First try development tree tests. */ diff --git a/tools/perf/tests/event_groups.c b/tools/perf/tests/event_groups.c index 3d9a2b524bba..ccd9d8b2903f 100644 --- a/tools/perf/tests/event_groups.c +++ b/tools/perf/tests/event_groups.c @@ -53,7 +53,7 @@ static int setup_uncore_event(void) struct perf_pmu *pmu = NULL; int i, fd; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { for (i = 0; i < NR_UNCORE_PMUS; i++) { if (!strcmp(uncore_pmus[i].name, pmu->name)) { pr_debug("Using %s for uncore pmu event\n", pmu->name); diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 277607ede060..9d05bc551791 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -112,7 +112,7 @@ static int test__checkevent_raw(struct evlist *evlist) bool type_matched = false; TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, 0x1a)); - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (pmu->type == evsel->attr.type) { TEST_ASSERT_VAL("PMU type expected once", !type_matched); type_matched = true; @@ -1443,12 +1443,12 @@ static int test__checkevent_config_cache(struct evlist *evlist) static bool test__pmu_cpu_valid(void) { - return !!perf_pmu__find("cpu"); + return !!perf_pmus__find("cpu"); } static bool test__intel_pt_valid(void) { - return !!perf_pmu__find("intel_pt"); + return !!perf_pmus__find("intel_pt"); } static int test__intel_pt(struct evlist *evlist) @@ -2246,7 +2246,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest struct perf_pmu *pmu = NULL; int ret = TEST_OK; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { struct stat st; char path[PATH_MAX]; struct dirent *ent; diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c index c05148ea400c..1d6493a5a956 100644 --- a/tools/perf/tests/parse-metric.c +++ b/tools/perf/tests/parse-metric.c @@ -11,7 +11,7 @@ #include "debug.h" #include "expr.h" #include "stat.h" -#include "pmu.h" +#include "pmus.h" struct value { const char *event; @@ -303,7 +303,7 @@ static int test__parse_metric(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("recursion fail failed", test_recursion_fail() == 0); TEST_ASSERT_VAL("Memory bandwidth", test_memory_bandwidth() == 0); - if (!perf_pmu__has_hybrid()) { + if (!perf_pmus__has_hybrid()) { TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0); TEST_ASSERT_VAL("test metric group", test_metric_group() == 0); } diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 734004f1a37d..64ecb7845af4 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -2,6 +2,7 @@ #include "math.h" #include "parse-events.h" #include "pmu.h" +#include "pmus.h" #include "tests.h" #include #include @@ -708,7 +709,7 @@ static int test__aliases(struct test_suite *test __maybe_unused, struct perf_pmu *pmu = NULL; unsigned long i; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { int count = 0; if (!is_pmu_core(pmu->name)) diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index b3bd14b025a8..cff6ab87b2f6 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -20,7 +20,7 @@ #include "tests.h" #include "util/mmap.h" #include "util/sample.h" -#include "pmu.h" +#include "pmus.h" static int spin_sleep(void) { @@ -375,7 +375,7 @@ static int test__switch_tracking(struct test_suite *test __maybe_unused, int sub cpu_clocks_evsel = evlist__last(evlist); /* Second event */ - if (perf_pmu__has_hybrid()) { + if (perf_pmus__has_hybrid()) { cycles = "cpu_core/cycles/u"; err = parse_event(evlist, cycles); if (err) { diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index c4630cfc80ea..49e80d15420b 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -8,7 +8,7 @@ #include "session.h" #include "evlist.h" #include "debug.h" -#include "pmu.h" +#include "pmus.h" #include #define TEMPL "/tmp/perf-test-XXXXXX" @@ -41,7 +41,7 @@ static int session_write_header(char *path) session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); - if (!perf_pmu__has_hybrid()) { + if (!perf_pmus__has_hybrid()) { session->evlist = evlist__new_default(); TEST_ASSERT_VAL("can't get evlist", session->evlist); } else { diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index a5c259bd5cc0..4578c26747e1 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -13,6 +13,7 @@ #include "debug.h" #include "env.h" #include "pmu.h" +#include "pmus.h" #define PACKAGE_CPUS_FMT \ "%s/devices/system/cpu/cpu%d/topology/package_cpus_list" @@ -473,10 +474,10 @@ struct hybrid_topology *hybrid_topology__new(void) struct hybrid_topology *tp = NULL; u32 nr = 0, i = 0; - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (pmu->is_core) nr++; } @@ -488,7 +489,7 @@ struct hybrid_topology *hybrid_topology__new(void) return NULL; tp->nr = nr; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (!pmu->is_core) continue; diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 4a4fdad820d6..9eabf3ec56e9 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -10,6 +10,7 @@ #include #include #include +#include "pmus.h" #include "strbuf.h" struct perf_env perf_env; @@ -323,7 +324,7 @@ int perf_env__read_pmu_mappings(struct perf_env *env) u32 pmu_num = 0; struct strbuf sb; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->name) continue; pmu_num++; @@ -337,7 +338,7 @@ int perf_env__read_pmu_mappings(struct perf_env *env) if (strbuf_init(&sb, 128 * pmu_num) < 0) return -ENOMEM; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->name) continue; if (strbuf_addf(&sb, "%u:%s", pmu->type, pmu->name) < 0) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1c6e22e3f345..b4237fc713d5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -48,6 +48,7 @@ #include "util/hashmap.h" #include "off_cpu.h" #include "pmu.h" +#include "pmus.h" #include "../perf-sys.h" #include "util/parse-branch-options.h" #include "util/bpf-filter.h" @@ -3139,7 +3140,7 @@ void evsel__zero_per_pkg(struct evsel *evsel) */ bool evsel__is_hybrid(const struct evsel *evsel) { - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return false; return evsel->core.is_pmu_core; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 37fa66b1ca77..e6d8ecd7a08e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -37,6 +37,7 @@ #include "debug.h" #include "cpumap.h" #include "pmu.h" +#include "pmus.h" #include "vdso.h" #include "strbuf.h" #include "build-id.h" @@ -744,7 +745,7 @@ static int write_pmu_mappings(struct feat_fd *ff, * Do a first pass to count number of pmu to avoid lseek so this * works in pipe mode as well. */ - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->name) continue; pmu_num++; @@ -754,7 +755,7 @@ static int write_pmu_mappings(struct feat_fd *ff, if (ret < 0) return ret; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->name) continue; @@ -1566,7 +1567,7 @@ static int __write_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, static int write_cpu_pmu_caps(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { - struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + struct perf_pmu *cpu_pmu = perf_pmus__find("cpu"); int ret; if (!cpu_pmu) @@ -1586,7 +1587,7 @@ static int write_pmu_caps(struct feat_fd *ff, int nr_pmu = 0; int ret; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->name || !strcmp(pmu->name, "cpu") || perf_pmu__caps_parse(pmu) <= 0) continue; @@ -1604,9 +1605,9 @@ static int write_pmu_caps(struct feat_fd *ff, * Write hybrid pmu caps first to maintain compatibility with * older perf tool. */ - if (perf_pmu__has_hybrid()) { + if (perf_pmus__has_hybrid()) { pmu = NULL; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->is_core) continue; @@ -1617,7 +1618,7 @@ static int write_pmu_caps(struct feat_fd *ff, } pmu = NULL; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (pmu->is_core || !pmu->nr_caps) continue; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index c9e422a38258..08ac3ea2e366 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -13,6 +13,7 @@ #include "debug.h" #include "symbol.h" #include "pmu.h" +#include "pmus.h" unsigned int perf_mem_events__loads_ldlat = 30; @@ -128,14 +129,14 @@ int perf_mem_events__init(void) if (!e->tag) continue; - if (!perf_pmu__has_hybrid()) { + if (!perf_pmus__has_hybrid()) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, "cpu"); e->supported = perf_mem_event__supported(mnt, sysfs_name); } else { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (!pmu->is_core) continue; @@ -175,7 +176,7 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, char sysfs_name[100]; struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (!pmu->is_core) continue; @@ -201,7 +202,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, if (!e->record) continue; - if (!perf_pmu__has_hybrid()) { + if (!perf_pmus__has_hybrid()) { if (!e->supported) { pr_err("failed: event '%s' not supported\n", perf_mem_events__name(j, NULL)); @@ -216,7 +217,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, return -1; } - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (!pmu->is_core) continue; rec_argv[i++] = "-e"; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 3f04a686d1cd..092ed6386a39 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -11,6 +11,7 @@ #include "evsel.h" #include "strbuf.h" #include "pmu.h" +#include "pmus.h" #include "print-events.h" #include "smt.h" #include "expr.h" @@ -273,7 +274,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, const char *metric_id; struct evsel *ev; size_t ids_size, matched_events, i; - bool all_pmus = !strcmp(pmu, "all") || !perf_pmu__has_hybrid() || !is_pmu_hybrid(pmu); + bool all_pmus = !strcmp(pmu, "all") || !perf_pmus__has_hybrid() || !is_pmu_hybrid(pmu); *out_metric_events = NULL; ids_size = hashmap__size(ids); @@ -488,7 +489,7 @@ static int metricgroup__sys_event_iter(const struct pmu_metric *pm, if (!pm->metric_expr || !pm->compat) return 0; - while ((pmu = perf_pmu__scan(pmu))) { + while ((pmu = perf_pmus__scan(pmu))) { if (!pmu->id || strcmp(pmu->id, pm->compat)) continue; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1a0be395c887..be544f948be2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -21,6 +21,7 @@ #include "parse-events-bison.h" #include "parse-events-flex.h" #include "pmu.h" +#include "pmus.h" #include "asm/bug.h" #include "util/parse-branch-options.h" #include "util/evsel_config.h" @@ -452,7 +453,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, const char *config_name = get_config_name(head_config); const char *metric_id = get_config_metric_id(head_config); - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { LIST_HEAD(config_terms); struct perf_event_attr attr; int ret; @@ -1193,7 +1194,7 @@ static int config_term_pmu(struct perf_event_attr *attr, struct parse_events_error *err) { if (term->type_term == PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE) { - const struct perf_pmu *pmu = perf_pmu__find_by_type(attr->type); + const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); if (perf_pmu__supports_legacy_cache(pmu)) { attr->type = PERF_TYPE_HW_CACHE; @@ -1203,7 +1204,7 @@ static int config_term_pmu(struct perf_event_attr *attr, term->type_term = PARSE_EVENTS__TERM_TYPE_USER; } if (term->type_term == PARSE_EVENTS__TERM_TYPE_HARDWARE) { - const struct perf_pmu *pmu = perf_pmu__find_by_type(attr->type); + const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); if (!pmu) { pr_debug("Failed to find PMU for type %d", attr->type); @@ -1480,7 +1481,7 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, return __parse_events_add_numeric(parse_state, list, /*pmu=*/NULL, type, config, head_config); - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { int ret; if (!perf_pmu__supports_wildcard_numeric(pmu)) @@ -1529,7 +1530,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, struct parse_events_error *err = parse_state->error; LIST_HEAD(config_terms); - pmu = parse_state->fake_pmu ?: perf_pmu__find(name); + pmu = parse_state->fake_pmu ?: perf_pmus__find(name); if (verbose > 1 && !(pmu && pmu->selectable)) { fprintf(stderr, "Attempting to add event pmu '%s' with '", @@ -1674,7 +1675,7 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, INIT_LIST_HEAD(list); - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { struct perf_pmu_alias *alias; bool auto_merge_stats; @@ -2410,7 +2411,7 @@ static int set_filter(struct evsel *evsel, const void *arg) return 0; } - while ((pmu = perf_pmu__scan(pmu)) != NULL) + while ((pmu = perf_pmus__scan(pmu)) != NULL) if (pmu->type == evsel->core.attr.type) { found = true; break; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 4e1f5de35be8..abd6ab460e12 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -15,6 +15,7 @@ #include #include #include "pmu.h" +#include "pmus.h" #include "evsel.h" #include "parse-events.h" #include "parse-events-bison.h" @@ -316,7 +317,7 @@ PE_NAME opt_pmu_config if (asprintf(&pattern, "%s*", $1) < 0) CLEANUP_YYABORT; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { char *name = pmu->name; if (parse_events__filter_pmu(parse_state, pmu)) diff --git a/tools/perf/util/pfm.c b/tools/perf/util/pfm.c index 6c11914c179f..076aecc22c16 100644 --- a/tools/perf/util/pfm.c +++ b/tools/perf/util/pfm.c @@ -10,7 +10,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/parse-events.h" -#include "util/pmu.h" +#include "util/pmus.h" #include "util/pfm.h" #include "util/strbuf.h" @@ -49,7 +49,7 @@ int parse_libpfm_events_option(const struct option *opt, const char *str, /* * force loading of the PMU list */ - perf_pmu__scan(NULL); + perf_pmus__scan(NULL); for (q = p; strsep(&p, ",{}"); q = p) { sep = p ? str + (p - p_orig - 1) : ""; @@ -86,7 +86,7 @@ int parse_libpfm_events_option(const struct option *opt, const char *str, goto error; } - pmu = perf_pmu__find_by_type((unsigned int)attr.type); + pmu = perf_pmus__find_by_type((unsigned int)attr.type); evsel = parse_events__add_event(evlist->core.nr_entries, &attr, q, /*metric_id=*/NULL, pmu); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 21ee23b78f5a..05056305fb58 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -4,20 +4,15 @@ #include #include #include -#include #include -#include #include #include #include #include #include -#include #include #include #include -#include -#include #include #include #include "debug.h" @@ -59,8 +54,6 @@ struct perf_pmu_format { struct list_head list; }; -static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name); - /* * Parse & process all the sysfs attributes located under * the directory specified in 'dir' parameter. @@ -554,31 +547,6 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias, return 0; } -/* Add all pmus in sysfs to pmu list: */ -static void pmu_read_sysfs(void) -{ - int fd; - DIR *dir; - struct dirent *dent; - - fd = perf_pmu__event_source_devices_fd(); - if (fd < 0) - return; - - dir = fdopendir(fd); - if (!dir) - return; - - while ((dent = readdir(dir))) { - if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) - continue; - /* add to static LIST_HEAD(pmus): */ - perf_pmu__find2(fd, dent->d_name); - } - - closedir(dir); -} - /* * Uncore PMUs have a "cpumask" file under sysfs. CPU PMUs (e.g. on arm/arm64) * may have a "cpus" file. @@ -894,7 +862,7 @@ static int pmu_max_precise(int dirfd, struct perf_pmu *pmu) return max_precise; } -static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name) { struct perf_pmu *pmu; LIST_HEAD(format); @@ -951,7 +919,7 @@ static struct perf_pmu *pmu_lookup(int dirfd, const char *lookup_name) INIT_LIST_HEAD(&pmu->caps); list_splice(&format, &pmu->format); list_splice(&aliases, &pmu->aliases); - list_add_tail(&pmu->list, &pmus); + list_add_tail(&pmu->list, pmus); pmu->default_config = perf_pmu__get_default_config(pmu); @@ -979,61 +947,6 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) } } -static struct perf_pmu *pmu_find(const char *name) -{ - struct perf_pmu *pmu; - - list_for_each_entry(pmu, &pmus, list) { - if (!strcmp(pmu->name, name) || - (pmu->alias_name && !strcmp(pmu->alias_name, name))) - return pmu; - } - - return NULL; -} - -struct perf_pmu *perf_pmu__find_by_type(unsigned int type) -{ - struct perf_pmu *pmu; - - list_for_each_entry(pmu, &pmus, list) - if (pmu->type == type) - return pmu; - - return NULL; -} - -struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) -{ - /* - * pmu iterator: If pmu is NULL, we start at the begin, - * otherwise return the next pmu. Returns NULL on end. - */ - if (!pmu) { - pmu_read_sysfs(); - pmu = list_prepare_entry(pmu, &pmus, list); - } - list_for_each_entry_continue(pmu, &pmus, list) - return pmu; - return NULL; -} - -struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) -{ - struct perf_pmu *pmu = NULL; - - if (evsel->pmu) - return evsel->pmu; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (pmu->type == evsel->core.attr.type) - break; - } - - ((struct evsel *)evsel)->pmu = pmu; - return pmu; -} - bool evsel__is_aux_event(const struct evsel *evsel) { struct perf_pmu *pmu = evsel__find_pmu(evsel); @@ -1070,43 +983,6 @@ void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel, evsel->core.attr.config |= field_prep(bits, val); } -struct perf_pmu *perf_pmu__find(const char *name) -{ - struct perf_pmu *pmu; - int dirfd; - - /* - * Once PMU is loaded it stays in the list, - * so we keep us from multiple reading/parsing - * the pmu format definitions. - */ - pmu = pmu_find(name); - if (pmu) - return pmu; - - dirfd = perf_pmu__event_source_devices_fd(); - pmu = pmu_lookup(dirfd, name); - close(dirfd); - - return pmu; -} - -static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) -{ - struct perf_pmu *pmu; - - /* - * Once PMU is loaded it stays in the list, - * so we keep us from multiple reading/parsing - * the pmu format definitions. - */ - pmu = pmu_find(name); - if (pmu) - return pmu; - - return pmu_lookup(dirfd, name); -} - static struct perf_pmu_format * pmu_find_format(struct list_head *formats, const char *name) { @@ -1536,99 +1412,6 @@ void perf_pmu__del_formats(struct list_head *formats) } } -static int sub_non_neg(int a, int b) -{ - if (b > a) - return 0; - return a - b; -} - -static char *format_alias(char *buf, int len, const struct perf_pmu *pmu, - const struct perf_pmu_alias *alias) -{ - struct parse_events_term *term; - int used = snprintf(buf, len, "%s/%s", pmu->name, alias->name); - - list_for_each_entry(term, &alias->terms, list) { - if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) - used += snprintf(buf + used, sub_non_neg(len, used), - ",%s=%s", term->config, - term->val.str); - } - - if (sub_non_neg(len, used) > 0) { - buf[used] = '/'; - used++; - } - if (sub_non_neg(len, used) > 0) { - buf[used] = '\0'; - used++; - } else - buf[len - 1] = '\0'; - - return buf; -} - -/** Struct for ordering events as output in perf list. */ -struct sevent { - /** PMU for event. */ - const struct perf_pmu *pmu; - /** - * Optional event for name, desc, etc. If not present then this is a - * selectable PMU and the event name is shown as "//". - */ - const struct perf_pmu_alias *event; - /** Is the PMU for the CPU? */ - bool is_cpu; -}; - -static int cmp_sevent(const void *a, const void *b) -{ - const struct sevent *as = a; - const struct sevent *bs = b; - const char *a_pmu_name = NULL, *b_pmu_name = NULL; - const char *a_name = "//", *a_desc = NULL, *a_topic = ""; - const char *b_name = "//", *b_desc = NULL, *b_topic = ""; - int ret; - - if (as->event) { - a_name = as->event->name; - a_desc = as->event->desc; - a_topic = as->event->topic ?: ""; - a_pmu_name = as->event->pmu_name; - } - if (bs->event) { - b_name = bs->event->name; - b_desc = bs->event->desc; - b_topic = bs->event->topic ?: ""; - b_pmu_name = bs->event->pmu_name; - } - /* Put extra events last. */ - if (!!a_desc != !!b_desc) - return !!a_desc - !!b_desc; - - /* Order by topics. */ - ret = strcmp(a_topic, b_topic); - if (ret) - return ret; - - /* Order CPU core events to be first */ - if (as->is_cpu != bs->is_cpu) - return as->is_cpu ? -1 : 1; - - /* Order by PMU name. */ - if (as->pmu != bs->pmu) { - a_pmu_name = a_pmu_name ?: (as->pmu->name ?: ""); - b_pmu_name = b_pmu_name ?: (bs->pmu->name ?: ""); - ret = strcmp(a_pmu_name, b_pmu_name); - if (ret) - return ret; - } - - /* Order by event name. */ - return strcmp(a_name, b_name); -} - bool is_pmu_core(const char *name) { return !strcmp(name, "cpu") || is_sysfs_pmu_core(name); @@ -1654,167 +1437,18 @@ bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) return !is_pmu_hybrid(pmu->name); } -static bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu) +bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu) { return pmu->is_core; } -int perf_pmu__num_mem_pmus(void) -{ - struct perf_pmu *pmu = NULL; - int count = 0; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (perf_pmu__is_mem_pmu(pmu)) - count++; - } - return count; -} - -static bool pmu_alias_is_duplicate(struct sevent *alias_a, - struct sevent *alias_b) -{ - const char *a_pmu_name = NULL, *b_pmu_name = NULL; - const char *a_name = "//", *b_name = "//"; - - - if (alias_a->event) { - a_name = alias_a->event->name; - a_pmu_name = alias_a->event->pmu_name; - } - if (alias_b->event) { - b_name = alias_b->event->name; - b_pmu_name = alias_b->event->pmu_name; - } - - /* Different names -> never duplicates */ - if (strcmp(a_name, b_name)) - return false; - - /* Don't remove duplicates for different PMUs */ - a_pmu_name = a_pmu_name ?: (alias_a->pmu->name ?: ""); - b_pmu_name = b_pmu_name ?: (alias_b->pmu->name ?: ""); - return strcmp(a_pmu_name, b_pmu_name) == 0; -} - -void print_pmu_events(const struct print_callbacks *print_cb, void *print_state) -{ - struct perf_pmu *pmu; - struct perf_pmu_alias *event; - char buf[1024]; - int printed = 0; - int len, j; - struct sevent *aliases; - - pmu = NULL; - len = 0; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - list_for_each_entry(event, &pmu->aliases, list) - len++; - if (pmu->selectable) - len++; - } - aliases = zalloc(sizeof(struct sevent) * len); - if (!aliases) { - pr_err("FATAL: not enough memory to print PMU events\n"); - return; - } - pmu = NULL; - j = 0; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - bool is_cpu = pmu->is_core; - - list_for_each_entry(event, &pmu->aliases, list) { - aliases[j].event = event; - aliases[j].pmu = pmu; - aliases[j].is_cpu = is_cpu; - j++; - } - if (pmu->selectable) { - aliases[j].event = NULL; - aliases[j].pmu = pmu; - aliases[j].is_cpu = is_cpu; - j++; - } - } - len = j; - qsort(aliases, len, sizeof(struct sevent), cmp_sevent); - for (j = 0; j < len; j++) { - const char *name, *alias = NULL, *scale_unit = NULL, - *desc = NULL, *long_desc = NULL, - *encoding_desc = NULL, *topic = NULL, - *pmu_name = NULL; - bool deprecated = false; - size_t buf_used; - - /* Skip duplicates */ - if (j > 0 && pmu_alias_is_duplicate(&aliases[j], &aliases[j - 1])) - continue; - - if (!aliases[j].event) { - /* A selectable event. */ - pmu_name = aliases[j].pmu->name; - buf_used = snprintf(buf, sizeof(buf), "%s//", pmu_name) + 1; - name = buf; - } else { - if (aliases[j].event->desc) { - name = aliases[j].event->name; - buf_used = 0; - } else { - name = format_alias(buf, sizeof(buf), aliases[j].pmu, - aliases[j].event); - if (aliases[j].is_cpu) { - alias = name; - name = aliases[j].event->name; - } - buf_used = strlen(buf) + 1; - } - pmu_name = aliases[j].event->pmu_name ?: (aliases[j].pmu->name ?: ""); - if (strlen(aliases[j].event->unit) || aliases[j].event->scale != 1.0) { - scale_unit = buf + buf_used; - buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, - "%G%s", aliases[j].event->scale, - aliases[j].event->unit) + 1; - } - desc = aliases[j].event->desc; - long_desc = aliases[j].event->long_desc; - topic = aliases[j].event->topic; - encoding_desc = buf + buf_used; - buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, - "%s/%s/", pmu_name, aliases[j].event->str) + 1; - deprecated = aliases[j].event->deprecated; - } - print_cb->print_event(print_state, - pmu_name, - topic, - name, - alias, - scale_unit, - deprecated, - "Kernel PMU event", - desc, - long_desc, - encoding_desc); - } - if (printed && pager_in_use()) - printf("\n"); - - zfree(&aliases); - return; -} - -bool pmu_have_event(const char *pname, const char *name) +bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name) { - struct perf_pmu *pmu; struct perf_pmu_alias *alias; - pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (strcmp(pname, pmu->name)) - continue; - list_for_each_entry(alias, &pmu->aliases, list) - if (!strcmp(alias->name, name)) - return true; + list_for_each_entry(alias, &pmu->aliases, list) { + if (!strcmp(alias->name, name)) + return true; } return false; } @@ -2020,24 +1654,6 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, name ?: "N/A", buf, config); } -bool perf_pmu__has_hybrid(void) -{ - static bool hybrid_scanned, has_hybrid; - - if (!hybrid_scanned) { - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (pmu->is_core && is_pmu_hybrid(pmu->name)) { - has_hybrid = true; - break; - } - } - hybrid_scanned = true; - } - return has_hybrid; -} - int perf_pmu__match(char *pattern, char *name, char *tok) { if (!name) @@ -2105,7 +1721,7 @@ int perf_pmu__pathname_fd(int dirfd, const char *pmu_name, const char *filename, return openat(dirfd, path, flags); } -static void perf_pmu__delete(struct perf_pmu *pmu) +void perf_pmu__delete(struct perf_pmu *pmu) { perf_pmu__del_formats(&pmu->format); perf_pmu__del_aliases(pmu); @@ -2118,14 +1734,3 @@ static void perf_pmu__delete(struct perf_pmu *pmu) zfree(&pmu->alias_name); free(pmu); } - -void perf_pmu__destroy(void) -{ - struct perf_pmu *pmu, *tmp; - - list_for_each_entry_safe(pmu, tmp, &pmus, list) { - list_del(&pmu->list); - - perf_pmu__delete(pmu); - } -} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index cb51ad6e40fa..f1f3e8a2e00e 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -198,8 +198,6 @@ struct perf_pmu_alias { char *pmu_name; }; -struct perf_pmu *perf_pmu__find(const char *name); -struct perf_pmu *perf_pmu__find_by_type(unsigned int type); void pmu_add_sys_aliases(struct list_head *head, struct perf_pmu *pmu); int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, struct list_head *head_terms, @@ -222,16 +220,13 @@ void perf_pmu__set_format(unsigned long *bits, long from, long to); int perf_pmu__format_parse(int dirfd, struct list_head *head); void perf_pmu__del_formats(struct list_head *formats); -struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu); - bool is_pmu_core(const char *name); bool is_pmu_hybrid(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); -int perf_pmu__num_mem_pmus(void); -void print_pmu_events(const struct print_callbacks *print_cb, void *print_state); -bool pmu_have_event(const char *pname, const char *name); +bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu); +bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name); FILE *perf_pmu__open_file(struct perf_pmu *pmu, const char *name); FILE *perf_pmu__open_file_at(struct perf_pmu *pmu, int dirfd, const char *name); @@ -261,7 +256,6 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, const char *name); void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); -bool perf_pmu__has_hybrid(void); int perf_pmu__match(char *pattern, char *name, char *tok); char *pmu_find_real_name(const char *name); @@ -273,6 +267,7 @@ int perf_pmu__pathname_scnprintf(char *buf, size_t size, int perf_pmu__event_source_devices_fd(void); int perf_pmu__pathname_fd(int dirfd, const char *pmu_name, const char *filename, int flags); -void perf_pmu__destroy(void); +struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *lookup_name); +void perf_pmu__delete(struct perf_pmu *pmu); #endif /* __PMU_H */ diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 140e11f00b29..58ff7937e9b7 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -1,16 +1,136 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include +#include +#include #include +#include +#include "debug.h" +#include "evsel.h" #include "pmus.h" #include "pmu.h" +#include "print-events.h" -LIST_HEAD(pmus); +static LIST_HEAD(pmus); + +void perf_pmus__destroy(void) +{ + struct perf_pmu *pmu, *tmp; + + list_for_each_entry_safe(pmu, tmp, &pmus, list) { + list_del(&pmu->list); + + perf_pmu__delete(pmu); + } +} + +static struct perf_pmu *pmu_find(const char *name) +{ + struct perf_pmu *pmu; + + list_for_each_entry(pmu, &pmus, list) { + if (!strcmp(pmu->name, name) || + (pmu->alias_name && !strcmp(pmu->alias_name, name))) + return pmu; + } + + return NULL; +} + +struct perf_pmu *perf_pmus__find(const char *name) +{ + struct perf_pmu *pmu; + int dirfd; + + /* + * Once PMU is loaded it stays in the list, + * so we keep us from multiple reading/parsing + * the pmu format definitions. + */ + pmu = pmu_find(name); + if (pmu) + return pmu; + + dirfd = perf_pmu__event_source_devices_fd(); + pmu = perf_pmu__lookup(&pmus, dirfd, name); + close(dirfd); + + return pmu; +} + +static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) +{ + struct perf_pmu *pmu; + + /* + * Once PMU is loaded it stays in the list, + * so we keep us from multiple reading/parsing + * the pmu format definitions. + */ + pmu = pmu_find(name); + if (pmu) + return pmu; + + return perf_pmu__lookup(&pmus, dirfd, name); +} + +/* Add all pmus in sysfs to pmu list: */ +static void pmu_read_sysfs(void) +{ + int fd; + DIR *dir; + struct dirent *dent; + + fd = perf_pmu__event_source_devices_fd(); + if (fd < 0) + return; + + dir = fdopendir(fd); + if (!dir) + return; + + while ((dent = readdir(dir))) { + if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) + continue; + /* add to static LIST_HEAD(pmus): */ + perf_pmu__find2(fd, dent->d_name); + } + + closedir(dir); +} + +struct perf_pmu *perf_pmus__find_by_type(unsigned int type) +{ + struct perf_pmu *pmu; + + list_for_each_entry(pmu, &pmus, list) + if (pmu->type == type) + return pmu; + + return NULL; +} + +struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu) +{ + /* + * pmu iterator: If pmu is NULL, we start at the begin, + * otherwise return the next pmu. Returns NULL on end. + */ + if (!pmu) { + pmu_read_sysfs(); + pmu = list_prepare_entry(pmu, &pmus, list); + } + list_for_each_entry_continue(pmu, &pmus, list) + return pmu; + return NULL; +} const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { if (!strcmp(pmu->name, str)) return pmu; /* Ignore "uncore_" prefix. */ @@ -26,3 +146,275 @@ const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) } return NULL; } + +int perf_pmus__num_mem_pmus(void) +{ + struct perf_pmu *pmu = NULL; + int count = 0; + + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + if (perf_pmu__is_mem_pmu(pmu)) + count++; + } + return count; +} + +/** Struct for ordering events as output in perf list. */ +struct sevent { + /** PMU for event. */ + const struct perf_pmu *pmu; + /** + * Optional event for name, desc, etc. If not present then this is a + * selectable PMU and the event name is shown as "//". + */ + const struct perf_pmu_alias *event; + /** Is the PMU for the CPU? */ + bool is_cpu; +}; + +static int cmp_sevent(const void *a, const void *b) +{ + const struct sevent *as = a; + const struct sevent *bs = b; + const char *a_pmu_name = NULL, *b_pmu_name = NULL; + const char *a_name = "//", *a_desc = NULL, *a_topic = ""; + const char *b_name = "//", *b_desc = NULL, *b_topic = ""; + int ret; + + if (as->event) { + a_name = as->event->name; + a_desc = as->event->desc; + a_topic = as->event->topic ?: ""; + a_pmu_name = as->event->pmu_name; + } + if (bs->event) { + b_name = bs->event->name; + b_desc = bs->event->desc; + b_topic = bs->event->topic ?: ""; + b_pmu_name = bs->event->pmu_name; + } + /* Put extra events last. */ + if (!!a_desc != !!b_desc) + return !!a_desc - !!b_desc; + + /* Order by topics. */ + ret = strcmp(a_topic, b_topic); + if (ret) + return ret; + + /* Order CPU core events to be first */ + if (as->is_cpu != bs->is_cpu) + return as->is_cpu ? -1 : 1; + + /* Order by PMU name. */ + if (as->pmu != bs->pmu) { + a_pmu_name = a_pmu_name ?: (as->pmu->name ?: ""); + b_pmu_name = b_pmu_name ?: (bs->pmu->name ?: ""); + ret = strcmp(a_pmu_name, b_pmu_name); + if (ret) + return ret; + } + + /* Order by event name. */ + return strcmp(a_name, b_name); +} + +static bool pmu_alias_is_duplicate(struct sevent *alias_a, + struct sevent *alias_b) +{ + const char *a_pmu_name = NULL, *b_pmu_name = NULL; + const char *a_name = "//", *b_name = "//"; + + + if (alias_a->event) { + a_name = alias_a->event->name; + a_pmu_name = alias_a->event->pmu_name; + } + if (alias_b->event) { + b_name = alias_b->event->name; + b_pmu_name = alias_b->event->pmu_name; + } + + /* Different names -> never duplicates */ + if (strcmp(a_name, b_name)) + return false; + + /* Don't remove duplicates for different PMUs */ + a_pmu_name = a_pmu_name ?: (alias_a->pmu->name ?: ""); + b_pmu_name = b_pmu_name ?: (alias_b->pmu->name ?: ""); + return strcmp(a_pmu_name, b_pmu_name) == 0; +} + +static int sub_non_neg(int a, int b) +{ + if (b > a) + return 0; + return a - b; +} + +static char *format_alias(char *buf, int len, const struct perf_pmu *pmu, + const struct perf_pmu_alias *alias) +{ + struct parse_events_term *term; + int used = snprintf(buf, len, "%s/%s", pmu->name, alias->name); + + list_for_each_entry(term, &alias->terms, list) { + if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) + used += snprintf(buf + used, sub_non_neg(len, used), + ",%s=%s", term->config, + term->val.str); + } + + if (sub_non_neg(len, used) > 0) { + buf[used] = '/'; + used++; + } + if (sub_non_neg(len, used) > 0) { + buf[used] = '\0'; + used++; + } else + buf[len - 1] = '\0'; + + return buf; +} + +void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state) +{ + struct perf_pmu *pmu; + struct perf_pmu_alias *event; + char buf[1024]; + int printed = 0; + int len, j; + struct sevent *aliases; + + pmu = NULL; + len = 0; + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + list_for_each_entry(event, &pmu->aliases, list) + len++; + if (pmu->selectable) + len++; + } + aliases = zalloc(sizeof(struct sevent) * len); + if (!aliases) { + pr_err("FATAL: not enough memory to print PMU events\n"); + return; + } + pmu = NULL; + j = 0; + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + bool is_cpu = pmu->is_core; + + list_for_each_entry(event, &pmu->aliases, list) { + aliases[j].event = event; + aliases[j].pmu = pmu; + aliases[j].is_cpu = is_cpu; + j++; + } + if (pmu->selectable) { + aliases[j].event = NULL; + aliases[j].pmu = pmu; + aliases[j].is_cpu = is_cpu; + j++; + } + } + len = j; + qsort(aliases, len, sizeof(struct sevent), cmp_sevent); + for (j = 0; j < len; j++) { + const char *name, *alias = NULL, *scale_unit = NULL, + *desc = NULL, *long_desc = NULL, + *encoding_desc = NULL, *topic = NULL, + *pmu_name = NULL; + bool deprecated = false; + size_t buf_used; + + /* Skip duplicates */ + if (j > 0 && pmu_alias_is_duplicate(&aliases[j], &aliases[j - 1])) + continue; + + if (!aliases[j].event) { + /* A selectable event. */ + pmu_name = aliases[j].pmu->name; + buf_used = snprintf(buf, sizeof(buf), "%s//", pmu_name) + 1; + name = buf; + } else { + if (aliases[j].event->desc) { + name = aliases[j].event->name; + buf_used = 0; + } else { + name = format_alias(buf, sizeof(buf), aliases[j].pmu, + aliases[j].event); + if (aliases[j].is_cpu) { + alias = name; + name = aliases[j].event->name; + } + buf_used = strlen(buf) + 1; + } + pmu_name = aliases[j].event->pmu_name ?: (aliases[j].pmu->name ?: ""); + if (strlen(aliases[j].event->unit) || aliases[j].event->scale != 1.0) { + scale_unit = buf + buf_used; + buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, + "%G%s", aliases[j].event->scale, + aliases[j].event->unit) + 1; + } + desc = aliases[j].event->desc; + long_desc = aliases[j].event->long_desc; + topic = aliases[j].event->topic; + encoding_desc = buf + buf_used; + buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used, + "%s/%s/", pmu_name, aliases[j].event->str) + 1; + deprecated = aliases[j].event->deprecated; + } + print_cb->print_event(print_state, + pmu_name, + topic, + name, + alias, + scale_unit, + deprecated, + "Kernel PMU event", + desc, + long_desc, + encoding_desc); + } + if (printed && pager_in_use()) + printf("\n"); + + zfree(&aliases); +} + +bool perf_pmus__have_event(const char *pname, const char *name) +{ + struct perf_pmu *pmu = perf_pmus__find(pname); + + return pmu && perf_pmu__have_event(pmu, name); +} + +bool perf_pmus__has_hybrid(void) +{ + static bool hybrid_scanned, has_hybrid; + + if (!hybrid_scanned) { + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + if (pmu->is_core && is_pmu_hybrid(pmu->name)) { + has_hybrid = true; + break; + } + } + hybrid_scanned = true; + } + return has_hybrid; +} + +struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) +{ + struct perf_pmu *pmu = evsel->pmu; + + if (!pmu) { + pmu = perf_pmus__find_by_type(evsel->core.attr.type); + ((struct evsel *)evsel)->pmu = pmu; + } + return pmu; +} diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 257de10788e8..2a771d9f8da7 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -2,9 +2,21 @@ #ifndef __PMUS_H #define __PMUS_H -extern struct list_head pmus; struct perf_pmu; +struct print_callbacks; + +void perf_pmus__destroy(void); + +struct perf_pmu *perf_pmus__find(const char *name); +struct perf_pmu *perf_pmus__find_by_type(unsigned int type); + +struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu); const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); +int perf_pmus__num_mem_pmus(void); +void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); +bool perf_pmus__have_event(const char *pname, const char *name); +bool perf_pmus__has_hybrid(void); + #endif /* __PMUS_H */ diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 8d823bc906e6..9cee7bb7a561 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -20,6 +20,7 @@ #include "metricgroup.h" #include "parse-events.h" #include "pmu.h" +#include "pmus.h" #include "print-events.h" #include "probe-file.h" #include "string2.h" @@ -271,7 +272,7 @@ int print_hwcache_events(const struct print_callbacks *print_cb, void *print_sta struct perf_pmu *pmu = NULL; const char *event_type_descriptor = event_type_descriptors[PERF_TYPE_HW_CACHE]; - while ((pmu = perf_pmu__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { /* * Skip uncore PMUs for performance. PERF_TYPE_HW_CACHE type * attributes can accept software PMUs in the extended type, so @@ -404,7 +405,7 @@ void print_events(const struct print_callbacks *print_cb, void *print_state) print_hwcache_events(print_cb, print_state); - print_pmu_events(print_cb, print_state); + perf_pmus__print_pmu_events(print_cb, print_state); print_cb->print_event(print_state, /*topic=*/NULL, diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index b27b27086422..7173f6fcdc11 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -22,6 +22,7 @@ #include "util/bpf-filter.h" #include "util/env.h" #include "util/pmu.h" +#include "util/pmus.h" #include #include "util.h" @@ -102,7 +103,7 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, return EOF; } -bool perf_pmu__has_hybrid(void) +bool perf_pmus__has_hybrid(void) { return false; } diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index a3e184e0b5ba..7ca69151136b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -20,6 +20,7 @@ #include "util.h" #include "iostat.h" #include "pmu.h" +#include "pmus.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -695,7 +696,7 @@ static bool evlist__has_hybrid(struct evlist *evlist) { struct evsel *evsel; - if (!perf_pmu__has_hybrid()) + if (!perf_pmus__has_hybrid()) return false; evlist__for_each_entry(evlist, evsel) { -- cgit v1.2.3 From 15c57a8037c9683fb5c09ecc576a333c02d6f105 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:04 -0700 Subject: perf pmus: Split pmus list into core and other Split the pmus list into core and other. This will later allow for the core and other pmus to be populated separately. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-29-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 52 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 58ff7937e9b7..4ef4fecd335f 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -12,13 +12,19 @@ #include "pmu.h" #include "print-events.h" -static LIST_HEAD(pmus); +static LIST_HEAD(core_pmus); +static LIST_HEAD(other_pmus); void perf_pmus__destroy(void) { struct perf_pmu *pmu, *tmp; - list_for_each_entry_safe(pmu, tmp, &pmus, list) { + list_for_each_entry_safe(pmu, tmp, &core_pmus, list) { + list_del(&pmu->list); + + perf_pmu__delete(pmu); + } + list_for_each_entry_safe(pmu, tmp, &other_pmus, list) { list_del(&pmu->list); perf_pmu__delete(pmu); @@ -29,7 +35,12 @@ static struct perf_pmu *pmu_find(const char *name) { struct perf_pmu *pmu; - list_for_each_entry(pmu, &pmus, list) { + list_for_each_entry(pmu, &core_pmus, list) { + if (!strcmp(pmu->name, name) || + (pmu->alias_name && !strcmp(pmu->alias_name, name))) + return pmu; + } + list_for_each_entry(pmu, &other_pmus, list) { if (!strcmp(pmu->name, name) || (pmu->alias_name && !strcmp(pmu->alias_name, name))) return pmu; @@ -53,7 +64,7 @@ struct perf_pmu *perf_pmus__find(const char *name) return pmu; dirfd = perf_pmu__event_source_devices_fd(); - pmu = perf_pmu__lookup(&pmus, dirfd, name); + pmu = perf_pmu__lookup(is_pmu_core(name) ? &core_pmus : &other_pmus, dirfd, name); close(dirfd); return pmu; @@ -72,7 +83,7 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) if (pmu) return pmu; - return perf_pmu__lookup(&pmus, dirfd, name); + return perf_pmu__lookup(is_pmu_core(name) ? &core_pmus : &other_pmus, dirfd, name); } /* Add all pmus in sysfs to pmu list: */ @@ -93,7 +104,7 @@ static void pmu_read_sysfs(void) while ((dent = readdir(dir))) { if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) continue; - /* add to static LIST_HEAD(pmus): */ + /* add to static LIST_HEAD(core_pmus) or LIST_HEAD(other_pmus): */ perf_pmu__find2(fd, dent->d_name); } @@ -104,24 +115,37 @@ struct perf_pmu *perf_pmus__find_by_type(unsigned int type) { struct perf_pmu *pmu; - list_for_each_entry(pmu, &pmus, list) + list_for_each_entry(pmu, &core_pmus, list) { if (pmu->type == type) return pmu; - + } + list_for_each_entry(pmu, &other_pmus, list) { + if (pmu->type == type) + return pmu; + } return NULL; } +/* + * pmu iterator: If pmu is NULL, we start at the begin, otherwise return the + * next pmu. Returns NULL on end. + */ struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu) { - /* - * pmu iterator: If pmu is NULL, we start at the begin, - * otherwise return the next pmu. Returns NULL on end. - */ + bool use_core_pmus = !pmu || pmu->is_core; + if (!pmu) { pmu_read_sysfs(); - pmu = list_prepare_entry(pmu, &pmus, list); + pmu = list_prepare_entry(pmu, &core_pmus, list); + } + if (use_core_pmus) { + list_for_each_entry_continue(pmu, &core_pmus, list) + return pmu; + + pmu = NULL; + pmu = list_prepare_entry(pmu, &other_pmus, list); } - list_for_each_entry_continue(pmu, &pmus, list) + list_for_each_entry_continue(pmu, &other_pmus, list) return pmu; return NULL; } -- cgit v1.2.3 From 9d6a1df9b2eef52ad03a594b1237a16dbbe34e83 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:05 -0700 Subject: perf pmus: Allow just core PMU scanning Scanning all PMUs is expensive as all PMUs sysfs entries are loaded, benchmarking shows more than 4x the cost: ``` $ perf bench internals pmu-scan -i 1000 Computing performance of sysfs PMU event scan for 1000 times Average core PMU scanning took: 989.231 usec (+- 1.535 usec) Average PMU scanning took: 4309.425 usec (+- 74.322 usec) ``` Add new perf_pmus__scan_core routine that scans just core PMUs. Replace perf_pmus__scan calls with perf_pmus__scan_core when non-core PMUs are being ignored. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-30-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/pmu.c | 5 +--- tools/perf/arch/x86/util/evlist.c | 5 +--- tools/perf/arch/x86/util/perf_regs.c | 8 +++--- tools/perf/bench/pmu-scan.c | 50 +++++++++++++++++++++--------------- tools/perf/tests/pmu-events.c | 5 +--- tools/perf/util/cputopo.c | 12 +++------ tools/perf/util/header.c | 5 +--- tools/perf/util/mem-events.c | 14 +++------- tools/perf/util/parse-events.c | 13 +++------- tools/perf/util/pmu.c | 10 -------- tools/perf/util/pmu.h | 2 -- tools/perf/util/pmus.c | 30 ++++++++++++++++------ tools/perf/util/pmus.h | 1 + tools/perf/util/print-events.c | 11 ++++---- 14 files changed, 75 insertions(+), 96 deletions(-) diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c index 2504d43a39a7..561de0cb6b95 100644 --- a/tools/perf/arch/arm64/util/pmu.c +++ b/tools/perf/arch/arm64/util/pmu.c @@ -11,10 +11,7 @@ static struct perf_pmu *pmu__find_core_pmu(void) { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmus__scan(pmu))) { - if (!is_pmu_core(pmu->name)) - continue; - + while ((pmu = perf_pmus__scan_core(pmu))) { /* * The cpumap should cover all CPUs. Otherwise, some CPUs may * not support some events or have different event IDs. diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index 03240c640c7f..8a6a0b98b976 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -33,13 +33,10 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, continue; } - while ((pmu = perf_pmus__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { struct perf_cpu_map *cpus; struct evsel *evsel; - if (!pmu->is_core) - continue; - evsel = evsel__new(attrs + i); if (evsel == NULL) goto out_delete_partial_list; diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index befa7f3659b9..116384f19baf 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -300,11 +300,9 @@ uint64_t arch__intr_reg_mask(void) * The same register set is supported among different hybrid PMUs. * Only check the first available one. */ - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (pmu->is_core) { - type = pmu->type; - break; - } + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + type = pmu->type; + break; } attr.config |= type << PERF_PMU_TYPE_SHIFT; } diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c index 51cae2d03353..c7d207f8e13c 100644 --- a/tools/perf/bench/pmu-scan.c +++ b/tools/perf/bench/pmu-scan.c @@ -22,6 +22,7 @@ struct pmu_scan_result { int nr_aliases; int nr_formats; int nr_caps; + bool is_core; }; static const struct option options[] = { @@ -53,6 +54,7 @@ static int save_result(void) r = results + nr_pmus; r->name = strdup(pmu->name); + r->is_core = pmu->is_core; r->nr_caps = pmu->nr_caps; r->nr_aliases = 0; @@ -72,7 +74,7 @@ static int save_result(void) return 0; } -static int check_result(void) +static int check_result(bool core_only) { struct pmu_scan_result *r; struct perf_pmu *pmu; @@ -81,6 +83,9 @@ static int check_result(void) for (int i = 0; i < nr_pmus; i++) { r = &results[i]; + if (core_only && !r->is_core) + continue; + pmu = perf_pmus__find(r->name); if (pmu == NULL) { pr_err("Cannot find PMU %s\n", r->name); @@ -130,7 +135,6 @@ static int run_pmu_scan(void) struct timeval start, end, diff; double time_average, time_stddev; u64 runtime_us; - unsigned int i; int ret; init_stats(&stats); @@ -142,26 +146,30 @@ static int run_pmu_scan(void) return -1; } - for (i = 0; i < iterations; i++) { - gettimeofday(&start, NULL); - perf_pmus__scan(NULL); - gettimeofday(&end, NULL); - - timersub(&end, &start, &diff); - runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; - update_stats(&stats, runtime_us); - - ret = check_result(); - perf_pmus__destroy(); - if (ret < 0) - break; + for (int j = 0; j < 2; j++) { + bool core_only = (j == 0); + + for (unsigned int i = 0; i < iterations; i++) { + gettimeofday(&start, NULL); + if (core_only) + perf_pmus__scan_core(NULL); + else + perf_pmus__scan(NULL); + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&stats, runtime_us); + + ret = check_result(core_only); + perf_pmus__destroy(); + if (ret < 0) + break; + } + time_average = avg_stats(&stats); + time_stddev = stddev_stats(&stats); + pr_info(" Average%s PMU scanning took: %.3f usec (+- %.3f usec)\n", + core_only ? " core" : "", time_average, time_stddev); } - - time_average = avg_stats(&stats); - time_stddev = stddev_stats(&stats); - pr_info(" Average PMU scanning took: %.3f usec (+- %.3f usec)\n", - time_average, time_stddev); - delete_result(); return 0; } diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 64ecb7845af4..64383fc34ef1 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -709,12 +709,9 @@ static int test__aliases(struct test_suite *test __maybe_unused, struct perf_pmu *pmu = NULL; unsigned long i; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { int count = 0; - if (!is_pmu_core(pmu->name)) - continue; - if (list_empty(&pmu->format)) { pr_debug2("skipping testing core PMU %s\n", pmu->name); continue; diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 4578c26747e1..729142ec9a9a 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -477,10 +477,9 @@ struct hybrid_topology *hybrid_topology__new(void) if (!perf_pmus__has_hybrid()) return NULL; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (pmu->is_core) - nr++; - } + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) + nr++; + if (nr == 0) return NULL; @@ -489,10 +488,7 @@ struct hybrid_topology *hybrid_topology__new(void) return NULL; tp->nr = nr; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (!pmu->is_core) - continue; - + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { if (load_hybrid_node(&tp->nodes[i], pmu)) { hybrid_topology__delete(tp); return NULL; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index e6d8ecd7a08e..2dde3ca20de5 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1607,10 +1607,7 @@ static int write_pmu_caps(struct feat_fd *ff, */ if (perf_pmus__has_hybrid()) { pmu = NULL; - while ((pmu = perf_pmus__scan(pmu))) { - if (!pmu->is_core) - continue; - + while ((pmu = perf_pmus__scan_core(pmu))) { ret = __write_pmu_caps(ff, pmu, true); if (ret < 0) return ret; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 08ac3ea2e366..c5596230a308 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -136,10 +136,7 @@ int perf_mem_events__init(void) } else { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (!pmu->is_core) - continue; - + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); e->supported |= perf_mem_event__supported(mnt, sysfs_name); @@ -176,10 +173,7 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, char sysfs_name[100]; struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (!pmu->is_core) - continue; - + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); if (!perf_mem_event__supported(mnt, sysfs_name)) { @@ -217,9 +211,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, return -1; } - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (!pmu->is_core) - continue; + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { rec_argv[i++] = "-e"; s = perf_mem_events__name(j, pmu->name); if (s) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index be544f948be2..e0c3f2037477 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -453,15 +453,12 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, const char *config_name = get_config_name(head_config); const char *metric_id = get_config_metric_id(head_config); - while ((pmu = perf_pmus__scan(pmu)) != NULL) { + /* Legacy cache events are only supported by core PMUs. */ + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { LIST_HEAD(config_terms); struct perf_event_attr attr; int ret; - /* Skip unsupported PMUs. */ - if (!perf_pmu__supports_legacy_cache(pmu)) - continue; - if (parse_events__filter_pmu(parse_state, pmu)) continue; @@ -1481,12 +1478,10 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, return __parse_events_add_numeric(parse_state, list, /*pmu=*/NULL, type, config, head_config); - while ((pmu = perf_pmus__scan(pmu)) != NULL) { + /* Wildcards on numeric values are only supported by core PMUs. */ + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { int ret; - if (!perf_pmu__supports_wildcard_numeric(pmu)) - continue; - if (parse_events__filter_pmu(parse_state, pmu)) continue; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 05056305fb58..7102084dd3aa 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1427,21 +1427,11 @@ bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) return pmu->is_core; } -bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu) -{ - return pmu->is_core; -} - bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) { return !is_pmu_hybrid(pmu->name); } -bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu) -{ - return pmu->is_core; -} - bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name) { struct perf_pmu_alias *alias; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index f1f3e8a2e00e..02fec0a7d4c8 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -223,9 +223,7 @@ void perf_pmu__del_formats(struct list_head *formats); bool is_pmu_core(const char *name); bool is_pmu_hybrid(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); -bool perf_pmu__supports_wildcard_numeric(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); -bool perf_pmu__is_mem_pmu(const struct perf_pmu *pmu); bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name); FILE *perf_pmu__open_file(struct perf_pmu *pmu, const char *name); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 4ef4fecd335f..de7fc36519c9 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -87,7 +87,7 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) } /* Add all pmus in sysfs to pmu list: */ -static void pmu_read_sysfs(void) +static void pmu_read_sysfs(bool core_only) { int fd; DIR *dir; @@ -104,6 +104,8 @@ static void pmu_read_sysfs(void) while ((dent = readdir(dir))) { if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) continue; + if (core_only && !is_pmu_core(dent->d_name)) + continue; /* add to static LIST_HEAD(core_pmus) or LIST_HEAD(other_pmus): */ perf_pmu__find2(fd, dent->d_name); } @@ -135,7 +137,7 @@ struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu) bool use_core_pmus = !pmu || pmu->is_core; if (!pmu) { - pmu_read_sysfs(); + pmu_read_sysfs(/*core_only=*/false); pmu = list_prepare_entry(pmu, &core_pmus, list); } if (use_core_pmus) { @@ -150,6 +152,18 @@ struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu) return NULL; } +struct perf_pmu *perf_pmus__scan_core(struct perf_pmu *pmu) +{ + if (!pmu) { + pmu_read_sysfs(/*core_only=*/true); + pmu = list_prepare_entry(pmu, &core_pmus, list); + } + list_for_each_entry_continue(pmu, &core_pmus, list) + return pmu; + + return NULL; +} + const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) { struct perf_pmu *pmu = NULL; @@ -176,10 +190,10 @@ int perf_pmus__num_mem_pmus(void) struct perf_pmu *pmu = NULL; int count = 0; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (perf_pmu__is_mem_pmu(pmu)) - count++; - } + /* All core PMUs are for mem events. */ + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) + count++; + return count; } @@ -421,8 +435,8 @@ bool perf_pmus__has_hybrid(void) if (!hybrid_scanned) { struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - if (pmu->is_core && is_pmu_hybrid(pmu->name)) { + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + if (is_pmu_hybrid(pmu->name)) { has_hybrid = true; break; } diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 2a771d9f8da7..9de0222ed52b 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -11,6 +11,7 @@ struct perf_pmu *perf_pmus__find(const char *name); struct perf_pmu *perf_pmus__find_by_type(unsigned int type); struct perf_pmu *perf_pmus__scan(struct perf_pmu *pmu); +struct perf_pmu *perf_pmus__scan_core(struct perf_pmu *pmu); const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 9cee7bb7a561..7a5f87392720 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -272,12 +272,11 @@ int print_hwcache_events(const struct print_callbacks *print_cb, void *print_sta struct perf_pmu *pmu = NULL; const char *event_type_descriptor = event_type_descriptors[PERF_TYPE_HW_CACHE]; - while ((pmu = perf_pmus__scan(pmu)) != NULL) { - /* - * Skip uncore PMUs for performance. PERF_TYPE_HW_CACHE type - * attributes can accept software PMUs in the extended type, so - * also skip. - */ + /* + * Only print core PMUs, skipping uncore for performance and + * PERF_TYPE_SOFTWARE that can succeed in opening legacy cache evenst. + */ + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { if (pmu->is_uncore || pmu->type == PERF_TYPE_SOFTWARE) continue; -- cgit v1.2.3 From 8e7d8a2eef3e48223a46e3ba676ce01a881a8519 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:06 -0700 Subject: perf pmus: Avoid repeated sysfs scanning perf_pmus__scan will process every directory in sysfs to see if it is a PMU, attempting to add it if not already in the pmus list. Add two booleans to record whether this scanning has been done for core or all PMUs. Skip scanning in the event that scanning has already occurred. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-31-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index de7fc36519c9..2c512345191d 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -14,6 +14,8 @@ static LIST_HEAD(core_pmus); static LIST_HEAD(other_pmus); +static bool read_sysfs_core_pmus; +static bool read_sysfs_all_pmus; void perf_pmus__destroy(void) { @@ -29,6 +31,8 @@ void perf_pmus__destroy(void) perf_pmu__delete(pmu); } + read_sysfs_core_pmus = false; + read_sysfs_all_pmus = false; } static struct perf_pmu *pmu_find(const char *name) @@ -53,6 +57,7 @@ struct perf_pmu *perf_pmus__find(const char *name) { struct perf_pmu *pmu; int dirfd; + bool core_pmu; /* * Once PMU is loaded it stays in the list, @@ -63,8 +68,15 @@ struct perf_pmu *perf_pmus__find(const char *name) if (pmu) return pmu; + if (read_sysfs_all_pmus) + return NULL; + + core_pmu = is_pmu_core(name); + if (core_pmu && read_sysfs_core_pmus) + return NULL; + dirfd = perf_pmu__event_source_devices_fd(); - pmu = perf_pmu__lookup(is_pmu_core(name) ? &core_pmus : &other_pmus, dirfd, name); + pmu = perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); close(dirfd); return pmu; @@ -73,6 +85,7 @@ struct perf_pmu *perf_pmus__find(const char *name) static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) { struct perf_pmu *pmu; + bool core_pmu; /* * Once PMU is loaded it stays in the list, @@ -83,7 +96,14 @@ static struct perf_pmu *perf_pmu__find2(int dirfd, const char *name) if (pmu) return pmu; - return perf_pmu__lookup(is_pmu_core(name) ? &core_pmus : &other_pmus, dirfd, name); + if (read_sysfs_all_pmus) + return NULL; + + core_pmu = is_pmu_core(name); + if (core_pmu && read_sysfs_core_pmus) + return NULL; + + return perf_pmu__lookup(core_pmu ? &core_pmus : &other_pmus, dirfd, name); } /* Add all pmus in sysfs to pmu list: */ @@ -93,6 +113,9 @@ static void pmu_read_sysfs(bool core_only) DIR *dir; struct dirent *dent; + if (read_sysfs_all_pmus || (core_only && read_sysfs_core_pmus)) + return; + fd = perf_pmu__event_source_devices_fd(); if (fd < 0) return; @@ -111,6 +134,12 @@ static void pmu_read_sysfs(bool core_only) } closedir(dir); + if (core_only) { + read_sysfs_core_pmus = true; + } else { + read_sysfs_core_pmus = true; + read_sysfs_all_pmus = true; + } } struct perf_pmu *perf_pmus__find_by_type(unsigned int type) -- cgit v1.2.3 From 1dd5f78d8337a7a69c9b76886a82e87524e56a51 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:07 -0700 Subject: perf pmus: Ensure all PMUs are read for find_by_type perf_pmus__find_by_type may be called for something like a raw event, in which case the PMU isn't guaranteed to have been looked up. Add a second check to make sure all PMUs are loaded. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-32-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 2c512345191d..6ecccb5ad03e 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -142,7 +142,7 @@ static void pmu_read_sysfs(bool core_only) } } -struct perf_pmu *perf_pmus__find_by_type(unsigned int type) +static struct perf_pmu *__perf_pmus__find_by_type(unsigned int type) { struct perf_pmu *pmu; @@ -150,6 +150,7 @@ struct perf_pmu *perf_pmus__find_by_type(unsigned int type) if (pmu->type == type) return pmu; } + list_for_each_entry(pmu, &other_pmus, list) { if (pmu->type == type) return pmu; @@ -157,6 +158,18 @@ struct perf_pmu *perf_pmus__find_by_type(unsigned int type) return NULL; } +struct perf_pmu *perf_pmus__find_by_type(unsigned int type) +{ + struct perf_pmu *pmu = __perf_pmus__find_by_type(type); + + if (pmu || read_sysfs_all_pmus) + return pmu; + + pmu_read_sysfs(/*core_only=*/false); + pmu = __perf_pmus__find_by_type(type); + return pmu; +} + /* * pmu iterator: If pmu is NULL, we start at the begin, otherwise return the * next pmu. Returns NULL on end. -- cgit v1.2.3 From 002c4845758e87efee8ec6ba6e6f9f1bcf0c3330 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:08 -0700 Subject: perf pmus: Add function to return count of core PMUs Add perf_pmus__num_core_pmus that will count core PMUs holding the result in a static. Reuse for perf_pmus__num_mem_pmus. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-33-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 21 ++++++++++++++------- tools/perf/util/pmus.h | 1 + 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 6ecccb5ad03e..bf927aed162e 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -229,14 +229,8 @@ const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) int perf_pmus__num_mem_pmus(void) { - struct perf_pmu *pmu = NULL; - int count = 0; - /* All core PMUs are for mem events. */ - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) - count++; - - return count; + return perf_pmus__num_core_pmus(); } /** Struct for ordering events as output in perf list. */ @@ -488,6 +482,19 @@ bool perf_pmus__has_hybrid(void) return has_hybrid; } +int perf_pmus__num_core_pmus(void) +{ + static int count; + + if (!count) { + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) + count++; + } + return count; +} + struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) { struct perf_pmu *pmu = evsel->pmu; diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 9de0222ed52b..27400a027d41 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -19,5 +19,6 @@ int perf_pmus__num_mem_pmus(void); void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool perf_pmus__have_event(const char *pname, const char *name); bool perf_pmus__has_hybrid(void); +int perf_pmus__num_core_pmus(void); #endif /* __PMUS_H */ -- cgit v1.2.3 From 94f9eb95d954bee0149fd1ce84c239c9e09ae9d8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:09 -0700 Subject: perf pmus: Remove perf_pmus__has_hybrid perf_pmus__has_hybrid was used to detect when there was >1 core PMU, this can be achieved with perf_pmus__num_core_pmus that doesn't depend upon is_pmu_hybrid and PMU name comparisons. When modifying the function calls take the opportunity to improve comments, enable/simplify tests that were previously failing for hybrid but now pass and to simplify generic code. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-34-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 2 +- tools/perf/arch/x86/util/evlist.c | 2 +- tools/perf/arch/x86/util/perf_regs.c | 2 +- tools/perf/builtin-record.c | 4 ++-- tools/perf/tests/attr.c | 9 ++++++++- tools/perf/tests/parse-metric.c | 7 ++----- tools/perf/tests/switch-tracking.c | 12 +----------- tools/perf/tests/topology.c | 14 ++------------ tools/perf/util/cputopo.c | 10 ++-------- tools/perf/util/evsel.c | 2 +- tools/perf/util/header.c | 2 +- tools/perf/util/mem-events.c | 18 +++++------------- tools/perf/util/metricgroup.c | 2 +- tools/perf/util/pmus.c | 18 ------------------ tools/perf/util/pmus.h | 1 - tools/perf/util/python.c | 4 ++-- tools/perf/util/stat-display.c | 2 +- 17 files changed, 31 insertions(+), 80 deletions(-) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index e466735d68d5..eb152770f148 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -281,7 +281,7 @@ static int test_events(const struct evlist_test *events, int cnt) int test__hybrid(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - if (!perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() == 1) return TEST_SKIP; return test_events(test__hybrid_events, ARRAY_SIZE(test__hybrid_events)); diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index 8a6a0b98b976..cbd582182932 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -18,7 +18,7 @@ static int ___evlist__add_default_attrs(struct evlist *evlist, for (i = 0; i < nr_attrs; i++) event_attr_init(attrs + i); - if (!perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() == 1) return evlist__add_attrs(evlist, attrs, nr_attrs); for (i = 0; i < nr_attrs; i++) { diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index 116384f19baf..8ad4112ad10c 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -292,7 +292,7 @@ uint64_t arch__intr_reg_mask(void) */ attr.sample_period = 1; - if (perf_pmus__has_hybrid()) { + if (perf_pmus__num_core_pmus() > 1) { struct perf_pmu *pmu = NULL; __u64 type = PERF_TYPE_RAW; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4b9212f75493..aec18db7ff23 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1294,7 +1294,7 @@ static int record__open(struct record *rec) * of waiting or event synthesis. */ if (opts->target.initial_delay || target__has_cpu(&opts->target) || - perf_pmus__has_hybrid()) { + perf_pmus__num_core_pmus() > 1) { pos = evlist__get_tracking_event(evlist); if (!evsel__is_dummy_event(pos)) { /* Set up dummy event. */ @@ -2193,7 +2193,7 @@ static void record__uniquify_name(struct record *rec) char *new_name; int ret; - if (!perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() == 1) return; evlist__for_each_entry(evlist, pos) { diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 674876e6c8e6..61186d0d1cfa 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -185,8 +185,15 @@ static int test__attr(struct test_suite *test __maybe_unused, int subtest __mayb char path_dir[PATH_MAX]; char *exec_path; - if (perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() > 1) { + /* + * TODO: Attribute tests hard code the PMU type. If there are >1 + * core PMU then each PMU will have a different type whic + * requires additional support. + */ + pr_debug("Skip test on hybrid systems"); return TEST_SKIP; + } /* First try development tree tests. */ if (!lstat("./tests", &st)) diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c index 1d6493a5a956..2c28fb50dc24 100644 --- a/tools/perf/tests/parse-metric.c +++ b/tools/perf/tests/parse-metric.c @@ -302,11 +302,8 @@ static int test__parse_metric(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("DCache_L2 failed", test_dcache_l2() == 0); TEST_ASSERT_VAL("recursion fail failed", test_recursion_fail() == 0); TEST_ASSERT_VAL("Memory bandwidth", test_memory_bandwidth() == 0); - - if (!perf_pmus__has_hybrid()) { - TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0); - TEST_ASSERT_VAL("test metric group", test_metric_group() == 0); - } + TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0); + TEST_ASSERT_VAL("test metric group", test_metric_group() == 0); return 0; } diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index cff6ab87b2f6..e52b031bedc5 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -375,17 +375,7 @@ static int test__switch_tracking(struct test_suite *test __maybe_unused, int sub cpu_clocks_evsel = evlist__last(evlist); /* Second event */ - if (perf_pmus__has_hybrid()) { - cycles = "cpu_core/cycles/u"; - err = parse_event(evlist, cycles); - if (err) { - cycles = "cpu_atom/cycles/u"; - pr_debug("Trying %s\n", cycles); - err = parse_event(evlist, cycles); - } - } else { - err = parse_event(evlist, cycles); - } + err = parse_event(evlist, cycles); if (err) { pr_debug("Failed to parse event %s\n", cycles); goto out_err; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 49e80d15420b..9dee63734e66 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -41,18 +41,8 @@ static int session_write_header(char *path) session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); - if (!perf_pmus__has_hybrid()) { - session->evlist = evlist__new_default(); - TEST_ASSERT_VAL("can't get evlist", session->evlist); - } else { - struct parse_events_error err; - - session->evlist = evlist__new(); - TEST_ASSERT_VAL("can't get evlist", session->evlist); - parse_events_error__init(&err); - parse_events(session->evlist, "cpu_core/cycles/", &err); - parse_events_error__exit(&err); - } + session->evlist = evlist__new_default(); + TEST_ASSERT_VAL("can't get evlist", session->evlist); perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY); perf_header__set_feat(&session->header, HEADER_NRCPUS); diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 729142ec9a9a..81cfc85f4668 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -472,15 +472,9 @@ struct hybrid_topology *hybrid_topology__new(void) { struct perf_pmu *pmu = NULL; struct hybrid_topology *tp = NULL; - u32 nr = 0, i = 0; + int nr = perf_pmus__num_core_pmus(), i = 0; - if (!perf_pmus__has_hybrid()) - return NULL; - - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) - nr++; - - if (nr == 0) + if (nr <= 1) return NULL; tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0]) * nr); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index b4237fc713d5..ec2ce39d66d8 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -3140,7 +3140,7 @@ void evsel__zero_per_pkg(struct evsel *evsel) */ bool evsel__is_hybrid(const struct evsel *evsel) { - if (!perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() == 1) return false; return evsel->core.is_pmu_core; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 2dde3ca20de5..0c69109c0a3b 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1605,7 +1605,7 @@ static int write_pmu_caps(struct feat_fd *ff, * Write hybrid pmu caps first to maintain compatibility with * older perf tool. */ - if (perf_pmus__has_hybrid()) { + if (perf_pmus__num_core_pmus() > 1) { pmu = NULL; while ((pmu = perf_pmus__scan_core(pmu))) { ret = __write_pmu_caps(ff, pmu, true); diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index c5596230a308..be15aadb6b14 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -121,6 +121,7 @@ int perf_mem_events__init(void) for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { struct perf_mem_event *e = perf_mem_events__ptr(j); char sysfs_name[100]; + struct perf_pmu *pmu = NULL; /* * If the event entry isn't valid, skip initialization @@ -129,18 +130,9 @@ int perf_mem_events__init(void) if (!e->tag) continue; - if (!perf_pmus__has_hybrid()) { - scnprintf(sysfs_name, sizeof(sysfs_name), - e->sysfs_name, "cpu"); - e->supported = perf_mem_event__supported(mnt, sysfs_name); - } else { - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { - scnprintf(sysfs_name, sizeof(sysfs_name), - e->sysfs_name, pmu->name); - e->supported |= perf_mem_event__supported(mnt, sysfs_name); - } + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); + e->supported |= perf_mem_event__supported(mnt, sysfs_name); } if (e->supported) @@ -196,7 +188,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, if (!e->record) continue; - if (!perf_pmus__has_hybrid()) { + if (perf_pmus__num_core_pmus() == 1) { if (!e->supported) { pr_err("failed: event '%s' not supported\n", perf_mem_events__name(j, NULL)); diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 092ed6386a39..70ef2e23a710 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -274,7 +274,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids, const char *metric_id; struct evsel *ev; size_t ids_size, matched_events, i; - bool all_pmus = !strcmp(pmu, "all") || !perf_pmus__has_hybrid() || !is_pmu_hybrid(pmu); + bool all_pmus = !strcmp(pmu, "all") || perf_pmus__num_core_pmus() == 1 || !is_pmu_core(pmu); *out_metric_events = NULL; ids_size = hashmap__size(ids); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index bf927aed162e..53f11f6ce878 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -464,24 +464,6 @@ bool perf_pmus__have_event(const char *pname, const char *name) return pmu && perf_pmu__have_event(pmu, name); } -bool perf_pmus__has_hybrid(void) -{ - static bool hybrid_scanned, has_hybrid; - - if (!hybrid_scanned) { - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { - if (is_pmu_hybrid(pmu->name)) { - has_hybrid = true; - break; - } - } - hybrid_scanned = true; - } - return has_hybrid; -} - int perf_pmus__num_core_pmus(void) { static int count; diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 27400a027d41..1e710720aec7 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -18,7 +18,6 @@ const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str); int perf_pmus__num_mem_pmus(void); void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool perf_pmus__have_event(const char *pname, const char *name); -bool perf_pmus__has_hybrid(void); int perf_pmus__num_core_pmus(void); #endif /* __PMUS_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 7173f6fcdc11..8de1b759bbaa 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -103,9 +103,9 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, return EOF; } -bool perf_pmus__has_hybrid(void) +int perf_pmus__num_core_pmus(void) { - return false; + return 1; } bool evsel__is_aux_event(const struct evsel *evsel __maybe_unused) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 7ca69151136b..a2bbdc25d979 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -696,7 +696,7 @@ static bool evlist__has_hybrid(struct evlist *evlist) { struct evsel *evsel; - if (!perf_pmus__has_hybrid()) + if (perf_pmus__num_core_pmus() == 1) return false; evlist__for_each_entry(evlist, evsel) { -- cgit v1.2.3 From 6b9da260703096b366ec0fe78d87053e8f577776 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:10 -0700 Subject: perf pmu: Remove is_pmu_hybrid Users have been removed or switched to using pmu->is_core with perf_pmus__num_core_pmus() > 1. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-35-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 7 +------ tools/perf/util/pmu.h | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7102084dd3aa..0520aa9fe991 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1417,11 +1417,6 @@ bool is_pmu_core(const char *name) return !strcmp(name, "cpu") || is_sysfs_pmu_core(name); } -bool is_pmu_hybrid(const char *name) -{ - return !strcmp(name, "cpu_atom") || !strcmp(name, "cpu_core"); -} - bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) { return pmu->is_core; @@ -1429,7 +1424,7 @@ bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) { - return !is_pmu_hybrid(pmu->name); + return pmu->is_core && perf_pmus__num_core_pmus() > 1; } bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name) diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 02fec0a7d4c8..287f593b15c7 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -221,7 +221,6 @@ int perf_pmu__format_parse(int dirfd, struct list_head *head); void perf_pmu__del_formats(struct list_head *formats); bool is_pmu_core(const char *name); -bool is_pmu_hybrid(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name); -- cgit v1.2.3 From a90cc5a9eeab45eaf9e47740366b8cf98c3aeb83 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 12:44:41 -0700 Subject: perf evsel: Don't let evsel__group_pmu_name() traverse unsorted group Previously the evsel__group_pmu_name would iterate the evsel's group, however, the list of evsels aren't yet sorted and so the loop may terminate prematurely. It is also not desirable to iterate the list of evsels during list_sort as the list may be broken. Precompute the group_pmu_name for the evsel before sorting, as part of the computation and only if necessary, iterate the whole list looking for group members so that being sorted isn't necessary. Move the group pmu name computation to parse-events.c given the closer dependency on the behavior of parse_events__sort_events_and_fix_groups. Fixes: 7abf0bccaaec7704 ("perf evsel: Add function to compute group PMU name") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rob Herring Cc: Sandipan Das Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230526194442.2355872-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 31 +++++-------------- tools/perf/util/evsel.h | 2 +- tools/perf/util/parse-events.c | 70 +++++++++++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 36 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ec2ce39d66d8..46da3f0bb47e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -291,6 +291,7 @@ void evsel__init(struct evsel *evsel, evsel->per_pkg_mask = NULL; evsel->collect_stat = false; evsel->pmu_name = NULL; + evsel->group_pmu_name = NULL; evsel->skippable = false; } @@ -391,6 +392,11 @@ struct evsel *evsel__clone(struct evsel *orig) if (evsel->pmu_name == NULL) goto out_err; } + if (orig->group_pmu_name) { + evsel->group_pmu_name = strdup(orig->group_pmu_name); + if (evsel->group_pmu_name == NULL) + goto out_err; + } if (orig->filter) { evsel->filter = strdup(orig->filter); if (evsel->filter == NULL) @@ -787,30 +793,6 @@ bool evsel__name_is(struct evsel *evsel, const char *name) return !strcmp(evsel__name(evsel), name); } -const char *evsel__group_pmu_name(const struct evsel *evsel) -{ - struct evsel *leader = evsel__leader(evsel); - struct evsel *pos; - - /* - * Software events may be in a group with other uncore PMU events. Use - * the pmu_name of the first non-software event to avoid breaking the - * software event out of the group. - * - * Aux event leaders, like intel_pt, expect a group with events from - * other PMUs, so substitute the AUX event's PMU in this case. - */ - if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) { - /* Starting with the leader, find the first event with a named PMU. */ - for_each_group_evsel(pos, leader) { - if (pos->pmu_name) - return pos->pmu_name; - } - } - - return evsel->pmu_name ?: "cpu"; -} - const char *evsel__metric_id(const struct evsel *evsel) { if (evsel->metric_id) @@ -1492,6 +1474,7 @@ void evsel__exit(struct evsel *evsel) zfree(&evsel->group_name); zfree(&evsel->name); zfree(&evsel->pmu_name); + zfree(&evsel->group_pmu_name); zfree(&evsel->unit); zfree(&evsel->metric_id); evsel__zero_per_pkg(evsel); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 429b172cc94d..6d9536ecbc7b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -72,6 +72,7 @@ struct evsel { char *name; char *group_name; const char *pmu_name; + const char *group_pmu_name; #ifdef HAVE_LIBTRACEEVENT struct tep_event *tp_format; #endif @@ -287,7 +288,6 @@ int arch_evsel__hw_name(struct evsel *evsel, char *bf, size_t size); int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); const char *evsel__name(struct evsel *evsel); bool evsel__name_is(struct evsel *evsel, const char *name); -const char *evsel__group_pmu_name(const struct evsel *evsel); const char *evsel__metric_id(const struct evsel *evsel); static inline bool evsel__is_tool(const struct evsel *evsel) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index e0c3f2037477..7f047ac11168 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1984,6 +1984,42 @@ int parse_events_terms(struct list_head *terms, const char *str) return ret; } +static int evsel__compute_group_pmu_name(struct evsel *evsel, + const struct list_head *head) +{ + struct evsel *leader = evsel__leader(evsel); + struct evsel *pos; + const char *group_pmu_name = evsel->pmu_name ?: "cpu"; + + /* + * Software events may be in a group with other uncore PMU events. Use + * the pmu_name of the first non-software event to avoid breaking the + * software event out of the group. + * + * Aux event leaders, like intel_pt, expect a group with events from + * other PMUs, so substitute the AUX event's PMU in this case. + */ + if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) { + /* + * Starting with the leader, find the first event with a named + * PMU. for_each_group_(member|evsel) isn't used as the list + * isn't yet sorted putting evsel's in the same group together. + */ + if (leader->pmu_name) { + group_pmu_name = leader->pmu_name; + } else if (leader->core.nr_members > 1) { + list_for_each_entry(pos, head, core.node) { + if (evsel__leader(pos) == leader && pos->pmu_name) { + group_pmu_name = pos->pmu_name; + break; + } + } + } + } + evsel->group_pmu_name = strdup(group_pmu_name); + return evsel->group_pmu_name ? 0 : -ENOMEM; +} + __weak int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) { /* Order by insertion index. */ @@ -2003,7 +2039,11 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list /* * First sort by grouping/leader. Read the leader idx only if the evsel - * is part of a group, as -1 indicates no group. + * is part of a group, by default ungrouped events will be sorted + * relative to grouped events based on where the first ungrouped event + * occurs. If both events don't have a group we want to fall-through to + * the arch specific sorting, that can reorder and fix things like + * Intel's topdown events. */ if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) { lhs_has_group = true; @@ -2019,8 +2059,8 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list /* Group by PMU if there is a group. Groups can't span PMUs. */ if (lhs_has_group && rhs_has_group) { - lhs_pmu_name = evsel__group_pmu_name(lhs); - rhs_pmu_name = evsel__group_pmu_name(rhs); + lhs_pmu_name = lhs->group_pmu_name; + rhs_pmu_name = rhs->group_pmu_name; ret = strcmp(lhs_pmu_name, rhs_pmu_name); if (ret) return ret; @@ -2030,13 +2070,14 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list return arch_evlist__cmp(lhs, rhs); } -static bool parse_events__sort_events_and_fix_groups(struct list_head *list) +static int parse_events__sort_events_and_fix_groups(struct list_head *list) { int idx = 0, unsorted_idx = -1; struct evsel *pos, *cur_leader = NULL; struct perf_evsel *cur_leaders_grp = NULL; bool idx_changed = false; int orig_num_leaders = 0, num_leaders = 0; + int ret; /* * Compute index to insert ungrouped events at. Place them where the @@ -2045,6 +2086,10 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) list_for_each_entry(pos, list, core.node) { const struct evsel *pos_leader = evsel__leader(pos); + ret = evsel__compute_group_pmu_name(pos, list); + if (ret) + return ret; + if (pos == pos_leader) orig_num_leaders++; @@ -2069,7 +2114,7 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) idx = 0; list_for_each_entry(pos, list, core.node) { const struct evsel *pos_leader = evsel__leader(pos); - const char *pos_pmu_name = evsel__group_pmu_name(pos); + const char *pos_pmu_name = pos->group_pmu_name; const char *cur_leader_pmu_name, *pos_leader_pmu_name; bool force_grouped = arch_evsel__must_be_in_group(pos); @@ -2086,7 +2131,7 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) if (!cur_leader) cur_leader = pos; - cur_leader_pmu_name = evsel__group_pmu_name(cur_leader); + cur_leader_pmu_name = cur_leader->group_pmu_name; if ((cur_leaders_grp != pos->core.leader && !force_grouped) || strcmp(cur_leader_pmu_name, pos_pmu_name)) { /* Event is for a different group/PMU than last. */ @@ -2098,7 +2143,7 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) */ cur_leaders_grp = pos->core.leader; } - pos_leader_pmu_name = evsel__group_pmu_name(pos_leader); + pos_leader_pmu_name = pos_leader->group_pmu_name; if (strcmp(pos_leader_pmu_name, pos_pmu_name) || force_grouped) { /* * Event's PMU differs from its leader's. Groups can't @@ -2115,7 +2160,7 @@ static bool parse_events__sort_events_and_fix_groups(struct list_head *list) num_leaders++; pos_leader->core.nr_members++; } - return idx_changed || num_leaders != orig_num_leaders; + return (idx_changed || num_leaders != orig_num_leaders) ? 1 : 0; } int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filter, @@ -2132,7 +2177,7 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte .pmu_filter = pmu_filter, .match_legacy_cache_terms = true, }; - int ret; + int ret, ret2; ret = parse_events__scanner(str, &parse_state); @@ -2141,8 +2186,11 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte return -1; } - if (parse_events__sort_events_and_fix_groups(&parse_state.list) && - warn_if_reordered && !parse_state.wild_card_pmus) + ret2 = parse_events__sort_events_and_fix_groups(&parse_state.list); + if (ret2 < 0) + return ret; + + if (ret2 && warn_if_reordered && !parse_state.wild_card_pmus) pr_warning("WARNING: events were regrouped to match PMUs\n"); /* -- cgit v1.2.3 From 797b9ec8c4bc9ec89f633a9b2c710b7b64753ca4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 12:44:42 -0700 Subject: perf evsel: Don't let for_each_group() treat the head of the list as one of its nodes Address/memory sanitizer was reporting issues in evsel__group_pmu_name because the for_each_group_evsel loop didn't terminate when the head was reached, the head would then be cast and accessed as an evsel leading to invalid memory accesses. Fix for_each_group_member and for_each_group_evsel to terminate at the list head. Note, evsel__group_pmu_name no longer iterates the group, but the problem is present regardless. Fixes: 717e263fc354d53d ("perf report: Show group description when event group is enabled") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rob Herring Cc: Sandipan Das Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230526194442.2355872-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 24 ++++++++++++++++-------- tools/perf/util/evsel_fprintf.c | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 6d9536ecbc7b..5e8371613565 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -459,16 +459,24 @@ static inline int evsel__group_idx(struct evsel *evsel) } /* Iterates group WITHOUT the leader. */ -#define for_each_group_member(_evsel, _leader) \ -for ((_evsel) = list_entry((_leader)->core.node.next, struct evsel, core.node); \ - (_evsel) && (_evsel)->core.leader == (&_leader->core); \ - (_evsel) = list_entry((_evsel)->core.node.next, struct evsel, core.node)) +#define for_each_group_member_head(_evsel, _leader, _head) \ +for ((_evsel) = list_entry((_leader)->core.node.next, struct evsel, core.node); \ + (_evsel) && &(_evsel)->core.node != (_head) && \ + (_evsel)->core.leader == &(_leader)->core; \ + (_evsel) = list_entry((_evsel)->core.node.next, struct evsel, core.node)) + +#define for_each_group_member(_evsel, _leader) \ + for_each_group_member_head(_evsel, _leader, &(_leader)->evlist->core.entries) /* Iterates group WITH the leader. */ -#define for_each_group_evsel(_evsel, _leader) \ -for ((_evsel) = _leader; \ - (_evsel) && (_evsel)->core.leader == (&_leader->core); \ - (_evsel) = list_entry((_evsel)->core.node.next, struct evsel, core.node)) +#define for_each_group_evsel_head(_evsel, _leader, _head) \ +for ((_evsel) = _leader; \ + (_evsel) && &(_evsel)->core.node != (_head) && \ + (_evsel)->core.leader == &(_leader)->core; \ + (_evsel) = list_entry((_evsel)->core.node.next, struct evsel, core.node)) + +#define for_each_group_evsel(_evsel, _leader) \ + for_each_group_evsel_head(_evsel, _leader, &(_leader)->evlist->core.entries) static inline bool evsel__has_branch_callstack(const struct evsel *evsel) { diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 79e42d66f55b..a1655fd7ed9b 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -2,6 +2,7 @@ #include #include #include +#include "util/evlist.h" #include "evsel.h" #include "util/evsel_fprintf.h" #include "util/event.h" -- cgit v1.2.3 From 5c6e7c21ae94bd01cd2a808f806dace6b31956f3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:46 -0700 Subject: perf header: Make nodes dynamic in write_mem_topology() Avoid a large static array, dynamically allocate the nodes avoiding a hard coded limited as well. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 0c69109c0a3b..d85b39079c31 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -24,6 +24,7 @@ #include #endif #include +#include // reallocarray #include "dso.h" #include "evlist.h" @@ -1396,13 +1397,14 @@ static int memory_node__sort(const void *a, const void *b) return na->node - nb->node; } -static int build_mem_topology(struct memory_node *nodes, u64 size, u64 *cntp) +static int build_mem_topology(struct memory_node **nodesp, u64 *cntp) { char path[PATH_MAX]; struct dirent *ent; DIR *dir; - u64 cnt = 0; int ret = 0; + size_t cnt = 0, size = 0; + struct memory_node *nodes = NULL; scnprintf(path, PATH_MAX, "%s/devices/system/node/", sysfs__mountpoint()); @@ -1426,26 +1428,32 @@ static int build_mem_topology(struct memory_node *nodes, u64 size, u64 *cntp) if (r != 1) continue; - if (WARN_ONCE(cnt >= size, - "failed to write MEM_TOPOLOGY, way too many nodes\n")) { - closedir(dir); - return -1; - } + if (cnt >= size) { + struct memory_node *new_nodes = + reallocarray(nodes, cnt + 4, sizeof(*nodes)); + if (!new_nodes) { + pr_err("Failed to write MEM_TOPOLOGY, size %zd nodes\n", size); + ret = -ENOMEM; + goto out; + } + nodes = new_nodes; + size += 4; + } ret = memory_node__read(&nodes[cnt++], idx); } - - *cntp = cnt; +out: closedir(dir); - - if (!ret) + if (!ret) { + *cntp = cnt; + *nodesp = nodes; qsort(nodes, cnt, sizeof(nodes[0]), memory_node__sort); + } else + free(nodes); return ret; } -#define MAX_MEMORY_NODES 2000 - /* * The MEM_TOPOLOGY holds physical memory map for every * node in system. The format of data is as follows: @@ -1464,8 +1472,8 @@ static int build_mem_topology(struct memory_node *nodes, u64 size, u64 *cntp) static int write_mem_topology(struct feat_fd *ff __maybe_unused, struct evlist *evlist __maybe_unused) { - static struct memory_node nodes[MAX_MEMORY_NODES]; - u64 bsize, version = 1, i, nr; + struct memory_node *nodes = NULL; + u64 bsize, version = 1, i, nr = 0; int ret; ret = sysfs__read_xll("devices/system/memory/block_size_bytes", @@ -1473,7 +1481,7 @@ static int write_mem_topology(struct feat_fd *ff __maybe_unused, if (ret) return ret; - ret = build_mem_topology(&nodes[0], MAX_MEMORY_NODES, &nr); + ret = build_mem_topology(&nodes, &nr); if (ret) return ret; @@ -1508,6 +1516,7 @@ static int write_mem_topology(struct feat_fd *ff __maybe_unused, } out: + free(nodes); return ret; } -- cgit v1.2.3 From b1d870a8bbd8389823a86f33c7832afc442be353 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:47 -0700 Subject: perf test x86: insn-x86 test data is immutable so mark it const This allows the movement of some sizeable data arrays (168,624 bytes) to .data.relro. Without PIE or the strings it could be moved to .rodata. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/insn-x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/arch/x86/tests/insn-x86.c b/tools/perf/arch/x86/tests/insn-x86.c index 735257d205b5..7b5eb8baf0f2 100644 --- a/tools/perf/arch/x86/tests/insn-x86.c +++ b/tools/perf/arch/x86/tests/insn-x86.c @@ -18,14 +18,14 @@ struct test_data { const char *asm_rep; }; -struct test_data test_data_32[] = { +const struct test_data test_data_32[] = { #include "insn-x86-dat-32.c" {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, {{0}, 0, 0, NULL, NULL, NULL}, }; -struct test_data test_data_64[] = { +const struct test_data test_data_64[] = { #include "insn-x86-dat-64.c" {{0x0f, 0x01, 0xee}, 3, 0, NULL, NULL, "0f 01 ee \trdpkru"}, {{0x0f, 0x01, 0xef}, 3, 0, NULL, NULL, "0f 01 ef \twrpkru"}, @@ -97,7 +97,7 @@ static int get_branch(const char *branch_str) return -1; } -static int test_data_item(struct test_data *dat, int x86_64) +static int test_data_item(const struct test_data *dat, int x86_64) { struct intel_pt_insn intel_pt_insn; int op, branch, ret; @@ -147,9 +147,9 @@ static int test_data_item(struct test_data *dat, int x86_64) return 0; } -static int test_data_set(struct test_data *dat_set, int x86_64) +static int test_data_set(const struct test_data *dat_set, int x86_64) { - struct test_data *dat; + const struct test_data *dat; int ret = 0; for (dat = dat_set; dat->expected_length; dat++) { -- cgit v1.2.3 From 7c1d862eda7f11cabd6941caee1404aad2d41458 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:48 -0700 Subject: perf test x86: intel-pt-test data is immutable so mark it const This allows the movement of 5,808 bytes from .data to .rodata. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/intel-pt-test.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/arch/x86/tests/intel-pt-test.c b/tools/perf/arch/x86/tests/intel-pt-test.c index 70b7f79396b1..09d61fa736e3 100644 --- a/tools/perf/arch/x86/tests/intel-pt-test.c +++ b/tools/perf/arch/x86/tests/intel-pt-test.c @@ -22,7 +22,7 @@ * @new_ctx: expected new packet context * @ctx_unchanged: the packet context must not change */ -static struct test_data { +static const struct test_data { int len; u8 bytes[INTEL_PT_PKT_MAX_SZ]; enum intel_pt_pkt_ctx ctx; @@ -186,7 +186,7 @@ static struct test_data { {0, {0}, 0, {0, 0, 0}, 0, 0 }, }; -static int dump_packet(struct intel_pt_pkt *packet, u8 *bytes, int len) +static int dump_packet(const struct intel_pt_pkt *packet, const u8 *bytes, int len) { char desc[INTEL_PT_PKT_DESC_MAX]; int ret, i; @@ -206,14 +206,14 @@ static int dump_packet(struct intel_pt_pkt *packet, u8 *bytes, int len) return TEST_OK; } -static void decoding_failed(struct test_data *d) +static void decoding_failed(const struct test_data *d) { pr_debug("Decoding failed!\n"); pr_debug("Decoding: "); dump_packet(&d->packet, d->bytes, d->len); } -static int fail(struct test_data *d, struct intel_pt_pkt *packet, int len, +static int fail(const struct test_data *d, struct intel_pt_pkt *packet, int len, enum intel_pt_pkt_ctx new_ctx) { decoding_failed(d); @@ -242,7 +242,7 @@ static int fail(struct test_data *d, struct intel_pt_pkt *packet, int len, return TEST_FAIL; } -static int test_ctx_unchanged(struct test_data *d, struct intel_pt_pkt *packet, +static int test_ctx_unchanged(const struct test_data *d, struct intel_pt_pkt *packet, enum intel_pt_pkt_ctx ctx) { enum intel_pt_pkt_ctx old_ctx = ctx; @@ -258,7 +258,7 @@ static int test_ctx_unchanged(struct test_data *d, struct intel_pt_pkt *packet, return TEST_OK; } -static int test_one(struct test_data *d) +static int test_one(const struct test_data *d) { struct intel_pt_pkt packet; enum intel_pt_pkt_ctx ctx = d->ctx; @@ -307,7 +307,7 @@ static int test_one(struct test_data *d) */ int test__intel_pt_pkt_decoder(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct test_data *d = data; + const struct test_data *d = data; int ret; for (d = data; d->len; d++) { -- cgit v1.2.3 From 60995604d11a5588ddd813030e2adc3b77e9af50 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:49 -0700 Subject: perf trace: Make some large static arrays const to move it to .data.rel.ro Allows the movement of 33,128 bytes from .data to .data.rel.ro. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b49d3abb1203..62c7c99a0fe4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -914,7 +914,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #include "trace/beauty/socket_type.c" #include "trace/beauty/waitid_options.c" -static struct syscall_fmt syscall_fmts[] = { +static const struct syscall_fmt syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, { .name = "arch_prctl", @@ -1176,18 +1176,21 @@ static int syscall_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_fmt *__syscall_fmt__find(struct syscall_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts, + const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } -static struct syscall_fmt *syscall_fmt__find(const char *name) +static const struct syscall_fmt *syscall_fmt__find(const char *name) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find(syscall_fmts, nmemb, name); } -static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts, const int nmemb, const char *alias) +static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts, + const int nmemb, const char *alias) { int i; @@ -1199,7 +1202,7 @@ static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts return NULL; } -static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias); @@ -1224,7 +1227,7 @@ struct syscall { bool nonexistent; struct tep_format_field *args; const char *name; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; struct syscall_arg_fmt *arg_fmt; }; @@ -1673,7 +1676,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) return 0; } -static struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { +static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { { .name = "msr", .scnprintf = SCA_X86_MSR, .strtoul = STUL_X86_MSR, }, { .name = "vector", .scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, }, }; @@ -1684,13 +1687,14 @@ static int syscall_arg_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_arg_fmt * -__syscall_arg_fmt__find_by_name(struct syscall_arg_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_arg_fmt * +__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp); } -static struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) +static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) { const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name); return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name); @@ -1735,8 +1739,9 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field * 7 unsigned long */ arg->scnprintf = SCA_FD; - } else { - struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); + } else { + const struct syscall_arg_fmt *fmt = + syscall_arg_fmt__find_by_name(field->name); if (fmt) { arg->scnprintf = fmt->scnprintf; @@ -4458,7 +4463,7 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt) { - struct syscall_fmt *scfmt = syscall_fmt__find(name); + const struct syscall_fmt *scfmt = syscall_fmt__find(name); if (scfmt) { int skip = 0; @@ -4525,7 +4530,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; -- cgit v1.2.3 From 1fc88e5a2d5358c9a8ae9fc992b75d34ed360339 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:50 -0700 Subject: perf trace beauty: Make MSR arrays const to move it to .data.rel.ro Allows the movement of 46,072 bytes from .data to .data.rel.ro. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-6-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/beauty.h | 2 +- tools/perf/trace/beauty/tracepoints/x86_msr.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 4c59edddd6a8..3d12bf0f6d07 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -11,7 +11,7 @@ struct strarray { u64 offset; int nr_entries; const char *prefix; - const char **entries; + const char * const *entries; }; #define DEFINE_STRARRAY(array, _prefix) struct strarray strarray__##array = { \ diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh index 0078689963e0..fa3c4418e856 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh @@ -13,7 +13,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h # Just the ones starting with 0x00000 so as to have a simple # array. -printf "static const char *x86_MSRs[] = {\n" +printf "static const char * const x86_MSRs[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*' grep -E $regex ${x86_msr_index} | grep -E -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \ sed -r "s/$regex/\2 \1/g" | sort -n | \ @@ -24,7 +24,7 @@ printf "};\n\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0xc0000[[:xdigit:]]+)[[:space:]]*.*' printf "#define x86_64_specific_MSRs_offset " grep -E $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1 -printf "static const char *x86_64_specific_MSRs[] = {\n" +printf "static const char * const x86_64_specific_MSRs[] = {\n" grep -E $regex ${x86_msr_index} | \ sed -r "s/$regex/\2 \1/g" | grep -E -vw 'K6_WHCR' | sort -n | \ xargs printf "\t[%s - x86_64_specific_MSRs_offset] = \"%s\",\n" @@ -33,7 +33,7 @@ printf "};\n\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0xc0010[[:xdigit:]]+)[[:space:]]*.*' printf "#define x86_AMD_V_KVM_MSRs_offset " grep -E $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1 -printf "static const char *x86_AMD_V_KVM_MSRs[] = {\n" +printf "static const char * const x86_AMD_V_KVM_MSRs[] = {\n" grep -E $regex ${x86_msr_index} | \ sed -r "s/$regex/\2 \1/g" | sort -n | \ xargs printf "\t[%s - x86_AMD_V_KVM_MSRs_offset] = \"%s\",\n" -- cgit v1.2.3 From 89df62c3ca1746177e5f1bae540b6b85c27aadcd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:51 -0700 Subject: tools api fs: Avoid large static PATH_MAX arrays Change struct fs to have a pointer to a dynamically allocated array rather than an array. This reduces the size of fs__entries from 24,768 bytes to 240 bytes. Read paths into a stack allocated array and strdup. Fix off-by-1 fscanf %s in fs__read_mounts caught by address sanitizer. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-7-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 82f53d81a7a7..22d34a0be8b4 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -88,7 +88,7 @@ static const char * const bpf_fs__known_mountpoints[] = { struct fs { const char *name; const char * const *mounts; - char path[PATH_MAX]; + char *path; bool found; bool checked; long magic; @@ -151,17 +151,23 @@ static bool fs__read_mounts(struct fs *fs) bool found = false; char type[100]; FILE *fp; + char path[PATH_MAX + 1]; fp = fopen("/proc/mounts", "r"); if (fp == NULL) - return NULL; + return false; while (!found && fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n", - fs->path, type) == 2) { + path, type) == 2) { - if (strcmp(type, fs->name) == 0) + if (strcmp(type, fs->name) == 0) { + free(fs->path); + fs->path = strdup(path); + if (!fs->path) + return false; found = true; + } } fclose(fp); @@ -188,8 +194,11 @@ static bool fs__check_mounts(struct fs *fs) ptr = fs->mounts; while (*ptr) { if (fs__valid_mount(*ptr, fs->magic) == 0) { + free(fs->path); + fs->path = strdup(*ptr); + if (!fs->path) + return false; fs->found = true; - strcpy(fs->path, *ptr); return true; } ptr++; @@ -227,10 +236,12 @@ static bool fs__env_override(struct fs *fs) if (!override_path) return false; + free(fs->path); + fs->path = strdup(override_path); + if (!fs->path) + return false; fs->found = true; fs->checked = true; - strncpy(fs->path, override_path, sizeof(fs->path) - 1); - fs->path[sizeof(fs->path) - 1] = '\0'; return true; } -- cgit v1.2.3 From 20dcad8f03117e50df569d18f6709d68807fedb8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:52 -0700 Subject: tools lib api fs tracing_path: Remove two unused MAX_PATH paths tracing_mnt was set but never written. tracing_events_path was set and read on errors paths, but its value is exactly tracing_path with a "/events" appended, so we can derive the value in the error paths. There appears to have been a missing "/" when tracing_events_path was initialized. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-8-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/tracing_path.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c index 7ba3e81274e8..30745f35d0d2 100644 --- a/tools/lib/api/fs/tracing_path.c +++ b/tools/lib/api/fs/tracing_path.c @@ -13,17 +13,12 @@ #include "tracing_path.h" -static char tracing_mnt[PATH_MAX] = "/sys/kernel/debug"; static char tracing_path[PATH_MAX] = "/sys/kernel/tracing"; -static char tracing_events_path[PATH_MAX] = "/sys/kernel/tracing/events"; static void __tracing_path_set(const char *tracing, const char *mountpoint) { - snprintf(tracing_mnt, sizeof(tracing_mnt), "%s", mountpoint); snprintf(tracing_path, sizeof(tracing_path), "%s/%s", mountpoint, tracing); - snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s%s", - mountpoint, tracing, "events"); } static const char *tracing_path_tracefs_mount(void) @@ -149,15 +144,15 @@ int tracing_path__strerror_open_tp(int err, char *buf, size_t size, /* sdt markers */ if (!strncmp(filename, "sdt_", 4)) { snprintf(buf, size, - "Error:\tFile %s/%s not found.\n" + "Error:\tFile %s/events/%s not found.\n" "Hint:\tSDT event cannot be directly recorded on.\n" "\tPlease first use 'perf probe %s:%s' before recording it.\n", - tracing_events_path, filename, sys, name); + tracing_path, filename, sys, name); } else { snprintf(buf, size, - "Error:\tFile %s/%s not found.\n" + "Error:\tFile %s/events/%s not found.\n" "Hint:\tPerhaps this kernel misses some CONFIG_ setting to enable this feature?.\n", - tracing_events_path, filename); + tracing_path, filename); } break; } @@ -169,9 +164,9 @@ int tracing_path__strerror_open_tp(int err, char *buf, size_t size, break; case EACCES: { snprintf(buf, size, - "Error:\tNo permissions to read %s/%s\n" + "Error:\tNo permissions to read %s/events/%s\n" "Hint:\tTry 'sudo mount -o remount,mode=755 %s'\n", - tracing_events_path, filename, tracing_path_mount()); + tracing_path, filename, tracing_path_mount()); } break; default: -- cgit v1.2.3 From 92294b906e6c55d67ef929a4762d9878d5cb75ac Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:53 -0700 Subject: perf daemon: Dynamically allocate path to perf Avoid a PATH_MAX array in __daemon (the .data section) by dynamically allocating the memory. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-9-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 34cbe3e959aa..f5674d824a40 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -90,7 +90,7 @@ struct daemon { char *base; struct list_head sessions; FILE *out; - char perf[PATH_MAX]; + char *perf; int signal_fd; time_t start; }; @@ -1490,6 +1490,14 @@ static int __cmd_ping(struct daemon *daemon, struct option parent_options[], return send_cmd(daemon, &cmd); } +static char *alloc_perf_exe_path(void) +{ + char path[PATH_MAX]; + + perf_exe(path, sizeof(path)); + return strdup(path); +} + int cmd_daemon(int argc, const char **argv) { struct option daemon_options[] = { @@ -1502,8 +1510,12 @@ int cmd_daemon(int argc, const char **argv) "field separator", "print counts with custom separator", ","), OPT_END() }; + int ret = -1; + + __daemon.perf = alloc_perf_exe_path(); + if (!__daemon.perf) + return -ENOMEM; - perf_exe(__daemon.perf, sizeof(__daemon.perf)); __daemon.out = stdout; argc = parse_options(argc, argv, daemon_options, daemon_usage, @@ -1511,22 +1523,22 @@ int cmd_daemon(int argc, const char **argv) if (argc) { if (!strcmp(argv[0], "start")) - return __cmd_start(&__daemon, daemon_options, argc, argv); + ret = __cmd_start(&__daemon, daemon_options, argc, argv); if (!strcmp(argv[0], "signal")) - return __cmd_signal(&__daemon, daemon_options, argc, argv); + ret = __cmd_signal(&__daemon, daemon_options, argc, argv); else if (!strcmp(argv[0], "stop")) - return __cmd_stop(&__daemon, daemon_options, argc, argv); + ret = __cmd_stop(&__daemon, daemon_options, argc, argv); else if (!strcmp(argv[0], "ping")) - return __cmd_ping(&__daemon, daemon_options, argc, argv); - - pr_err("failed: unknown command '%s'\n", argv[0]); - return -1; - } - - if (setup_config(&__daemon)) { - pr_err("failed: config not found\n"); - return -1; + ret = __cmd_ping(&__daemon, daemon_options, argc, argv); + else + pr_err("failed: unknown command '%s'\n", argv[0]); + } else { + ret = setup_config(&__daemon); + if (ret) + pr_err("failed: config not found\n"); + else + ret = send_cmd_list(&__daemon); } - - return send_cmd_list(&__daemon); + zfree(&__daemon.perf); + return ret; } -- cgit v1.2.3 From eef4fee5e52071d563d9a851df1c09869215ee15 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:54 -0700 Subject: perf lock: Dynamically allocate lockhash_table lockhash_table is 32,768 bytes in .bss, make it a memory allocation so that the space is freed for non-lock perf commands. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-10-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 70b14ba5fdd5..fc8356bd6e3a 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -48,7 +48,7 @@ static struct target target; #define LOCKHASH_BITS 12 #define LOCKHASH_SIZE (1UL << LOCKHASH_BITS) -static struct hlist_head lockhash_table[LOCKHASH_SIZE]; +static struct hlist_head *lockhash_table; #define __lockhashfn(key) hash_long((unsigned long)key, LOCKHASH_BITS) #define lockhashentry(key) (lockhash_table + __lockhashfn((key))) @@ -1871,7 +1871,6 @@ static int __cmd_contention(int argc, const char **argv) }; struct lock_contention con = { .target = &target, - .result = &lockhash_table[0], .map_nr_entries = bpf_map_entries, .max_stack = max_stack_depth, .stack_skip = stack_skip, @@ -1880,10 +1879,17 @@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, }; + lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); + if (!lockhash_table) + return -ENOMEM; + + con.result = &lockhash_table[0]; + session = perf_session__new(use_bpf ? NULL : &data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); - return PTR_ERR(session); + err = PTR_ERR(session); + goto out_delete; } con.machine = &session->machines.host; @@ -1983,6 +1989,7 @@ out_delete: evlist__delete(con.evlist); lock_contention_finish(); perf_session__delete(session); + zfree(&lockhash_table); return err; } @@ -2348,6 +2355,10 @@ int cmd_lock(int argc, const char **argv) unsigned int i; int rc = 0; + lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); + if (!lockhash_table) + return -ENOMEM; + for (i = 0; i < LOCKHASH_SIZE; i++) INIT_HLIST_HEAD(lockhash_table + i); @@ -2369,7 +2380,7 @@ int cmd_lock(int argc, const char **argv) rc = __cmd_report(false); } else if (!strcmp(argv[0], "script")) { /* Aliased to 'perf script' */ - return cmd_script(argc, argv); + rc = cmd_script(argc, argv); } else if (!strcmp(argv[0], "info")) { if (argc) { argc = parse_options(argc, argv, @@ -2403,5 +2414,6 @@ int cmd_lock(int argc, const char **argv) usage_with_options(lock_usage, lock_options); } + zfree(&lockhash_table); return rc; } -- cgit v1.2.3 From ddc27bb8a9a5c0236ae65c3451d9c7024040d11d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:55 -0700 Subject: perf timechart: Make large arrays dynamic Allocate start time and state arrays when command starts rather than using 114,688 bytes in .bss. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-11-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 48 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index bce1cf896f9c..829d99fecfd0 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -315,10 +315,10 @@ static void pid_put_sample(struct timechart *tchart, int pid, int type, #define MAX_CPUS 4096 -static u64 cpus_cstate_start_times[MAX_CPUS]; -static int cpus_cstate_state[MAX_CPUS]; -static u64 cpus_pstate_start_times[MAX_CPUS]; -static u64 cpus_pstate_state[MAX_CPUS]; +static u64 *cpus_cstate_start_times; +static int *cpus_cstate_state; +static u64 *cpus_pstate_start_times; +static u64 *cpus_pstate_state; static int process_comm_event(struct perf_tool *tool, union perf_event *event, @@ -1981,12 +1981,34 @@ int cmd_timechart(int argc, const char **argv) "perf timechart record []", NULL }; + int ret; + + cpus_cstate_start_times = calloc(MAX_CPUS, sizeof(*cpus_cstate_start_times)); + if (!cpus_cstate_start_times) + return -ENOMEM; + cpus_cstate_state = calloc(MAX_CPUS, sizeof(*cpus_cstate_state)); + if (!cpus_cstate_state) { + ret = -ENOMEM; + goto out; + } + cpus_pstate_start_times = calloc(MAX_CPUS, sizeof(*cpus_pstate_start_times)); + if (!cpus_pstate_start_times) { + ret = -ENOMEM; + goto out; + } + cpus_pstate_state = calloc(MAX_CPUS, sizeof(*cpus_pstate_state)); + if (!cpus_pstate_state) { + ret = -ENOMEM; + goto out; + } + argc = parse_options_subcommand(argc, argv, timechart_options, timechart_subcommands, timechart_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (tchart.power_only && tchart.tasks_only) { pr_err("-P and -T options cannot be used at the same time.\n"); - return -1; + ret = -1; + goto out; } if (argc && strlen(argv[0]) > 2 && strstarts("record", argv[0])) { @@ -1996,17 +2018,25 @@ int cmd_timechart(int argc, const char **argv) if (tchart.power_only && tchart.tasks_only) { pr_err("-P and -T options cannot be used at the same time.\n"); - return -1; + ret = -1; + goto out; } if (tchart.io_only) - return timechart__io_record(argc, argv); + ret = timechart__io_record(argc, argv); else - return timechart__record(&tchart, argc, argv); + ret = timechart__record(&tchart, argc, argv); + goto out; } else if (argc) usage_with_options(timechart_usage, timechart_options); setup_pager(); - return __cmd_timechart(&tchart, output_name); + ret = __cmd_timechart(&tchart, output_name); +out: + zfree(&cpus_cstate_start_times); + zfree(&cpus_cstate_state); + zfree(&cpus_pstate_start_times); + zfree(&cpus_pstate_state); + return ret; } -- cgit v1.2.3 From 430952e6d7a02bbf4d2d4a6d3baa7ce4b66052d7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:56 -0700 Subject: perf probe: Dynamically allocate params memory Avoid 14,432 bytes in .bss by dynamically allocating params. Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20230526183401.2326121-12-irogers@google.com Cc: K Prateek Nayak Cc: Ravi Bangoria Cc: Mark Rutland Cc: Ross Zwisler Cc: Steven Rostedt (Google) Cc: Sean Christopherson Cc: Yang Jihong Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Masami Hiramatsu (Google) Cc: Namhyung Kim Cc: Leo Yan Cc: Andi Kleen Cc: Alexander Shishkin Cc: Kan Liang Cc: Tiezhu Yang Cc: Ingo Molnar Cc: Paolo Bonzini Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-probe.c | 133 ++++++++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 4df05b992093..019fef8da6a8 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -47,29 +47,29 @@ static struct { char *target; struct strfilter *filter; struct nsinfo *nsi; -} params; +} *params; /* Parse an event definition. Note that any error must die. */ static int parse_probe_event(const char *str) { - struct perf_probe_event *pev = ¶ms.events[params.nevents]; + struct perf_probe_event *pev = ¶ms->events[params->nevents]; int ret; - pr_debug("probe-definition(%d): %s\n", params.nevents, str); - if (++params.nevents == MAX_PROBES) { + pr_debug("probe-definition(%d): %s\n", params->nevents, str); + if (++params->nevents == MAX_PROBES) { pr_err("Too many probes (> %d) were specified.", MAX_PROBES); return -1; } - pev->uprobes = params.uprobes; - if (params.target) { - pev->target = strdup(params.target); + pev->uprobes = params->uprobes; + if (params->target) { + pev->target = strdup(params->target); if (!pev->target) return -ENOMEM; - params.target_used = true; + params->target_used = true; } - pev->nsi = nsinfo__get(params.nsi); + pev->nsi = nsinfo__get(params->nsi); /* Parse a perf-probe command into event */ ret = parse_perf_probe_command(str, pev); @@ -84,12 +84,12 @@ static int params_add_filter(const char *str) int ret = 0; pr_debug2("Add filter: %s\n", str); - if (!params.filter) { - params.filter = strfilter__new(str, &err); - if (!params.filter) + if (!params->filter) { + params->filter = strfilter__new(str, &err); + if (!params->filter) ret = err ? -EINVAL : -ENOMEM; } else - ret = strfilter__or(params.filter, str, &err); + ret = strfilter__or(params->filter, str, &err); if (ret == -EINVAL) { pr_err("Filter parse error at %td.\n", err - str + 1); @@ -112,17 +112,17 @@ static int set_target(const char *ptr) * TODO: Support relative path, and $PATH, $LD_LIBRARY_PATH, * short module name. */ - if (!params.target && ptr && *ptr == '/') { - params.target = strdup(ptr); - if (!params.target) + if (!params->target && ptr && *ptr == '/') { + params->target = strdup(ptr); + if (!params->target) return -ENOMEM; - params.target_used = false; + params->target_used = false; found = 1; buf = ptr + (strlen(ptr) - 3); if (strcmp(buf, ".ko")) - params.uprobes = true; + params->uprobes = true; } @@ -172,15 +172,15 @@ static int opt_set_target(const struct option *opt, const char *str, if (str) { if (!strcmp(opt->long_name, "exec")) - params.uprobes = true; + params->uprobes = true; else if (!strcmp(opt->long_name, "module")) - params.uprobes = false; + params->uprobes = false; else return ret; /* Expand given path to absolute path, except for modulename */ - if (params.uprobes || strchr(str, '/')) { - tmp = nsinfo__realpath(str, params.nsi); + if (params->uprobes || strchr(str, '/')) { + tmp = nsinfo__realpath(str, params->nsi); if (!tmp) { pr_warning("Failed to get the absolute path of %s: %m\n", str); return ret; @@ -190,9 +190,9 @@ static int opt_set_target(const struct option *opt, const char *str, if (!tmp) return -ENOMEM; } - free(params.target); - params.target = tmp; - params.target_used = false; + free(params->target); + params->target = tmp; + params->target_used = false; ret = 0; } @@ -217,7 +217,7 @@ static int opt_set_target_ns(const struct option *opt __maybe_unused, } nsip = nsinfo__new(ns_pid); if (nsip && nsinfo__need_setns(nsip)) - params.nsi = nsinfo__get(nsip); + params->nsi = nsinfo__get(nsip); nsinfo__put(nsip); ret = 0; @@ -238,14 +238,14 @@ static int opt_show_lines(const struct option *opt, if (!str) return 0; - if (params.command == 'L') { + if (params->command == 'L') { pr_warning("Warning: more than one --line options are" " detected. Only the first one is valid.\n"); return 0; } - params.command = opt->short_name; - ret = parse_line_range_desc(str, ¶ms.line_range); + params->command = opt->short_name; + ret = parse_line_range_desc(str, ¶ms->line_range); return ret; } @@ -253,7 +253,7 @@ static int opt_show_lines(const struct option *opt, static int opt_show_vars(const struct option *opt, const char *str, int unset __maybe_unused) { - struct perf_probe_event *pev = ¶ms.events[params.nevents]; + struct perf_probe_event *pev = ¶ms->events[params->nevents]; int ret; if (!str) @@ -264,7 +264,7 @@ static int opt_show_vars(const struct option *opt, pr_err(" Error: '--vars' doesn't accept arguments.\n"); return -EINVAL; } - params.command = opt->short_name; + params->command = opt->short_name; return ret; } @@ -276,7 +276,7 @@ static int opt_add_probe_event(const struct option *opt, const char *str, int unset __maybe_unused) { if (str) { - params.command = opt->short_name; + params->command = opt->short_name; return parse_probe_event(str); } @@ -287,7 +287,7 @@ static int opt_set_filter_with_command(const struct option *opt, const char *str, int unset) { if (!unset) - params.command = opt->short_name; + params->command = opt->short_name; if (str) return params_add_filter(str); @@ -306,20 +306,29 @@ static int opt_set_filter(const struct option *opt __maybe_unused, static int init_params(void) { - return line_range__init(¶ms.line_range); + int ret; + + params = calloc(1, sizeof(*params)); + if (!params) + return -ENOMEM; + + ret = line_range__init(¶ms->line_range); + if (ret) + zfree(¶ms); + return ret; } static void cleanup_params(void) { int i; - for (i = 0; i < params.nevents; i++) - clear_perf_probe_event(params.events + i); - line_range__clear(¶ms.line_range); - free(params.target); - strfilter__delete(params.filter); - nsinfo__put(params.nsi); - memset(¶ms, 0, sizeof(params)); + for (i = 0; i < params->nevents; i++) + clear_perf_probe_event(params->events + i); + line_range__clear(¶ms->line_range); + free(params->target); + strfilter__delete(params->filter); + nsinfo__put(params->nsi); + zfree(¶ms); } static void pr_err_with_code(const char *msg, int err) @@ -346,7 +355,7 @@ static int perf_add_probe_events(struct perf_probe_event *pevs, int npevs) if (ret < 0) goto out_cleanup; - if (params.command == 'D') { /* it shows definition */ + if (params->command == 'D') { /* it shows definition */ if (probe_conf.bootconfig) ret = show_bootconfig_events(pevs, npevs); else @@ -635,7 +644,7 @@ __cmd_probe(int argc, const char **argv) usage_with_options_msg(probe_usage, options, "'-' is not supported.\n"); } - if (params.command && params.command != 'a') { + if (params->command && params->command != 'a') { usage_with_options_msg(probe_usage, options, "another command except --add is set.\n"); } @@ -644,7 +653,7 @@ __cmd_probe(int argc, const char **argv) pr_err_with_code(" Error: Command Parse Error.", ret); return ret; } - params.command = 'a'; + params->command = 'a'; } ret = symbol__validate_sym_arguments(); @@ -664,54 +673,54 @@ __cmd_probe(int argc, const char **argv) * nor change running kernel. So if user gives offline vmlinux, * ignore its buildid. */ - if (!strchr("lda", params.command) && symbol_conf.vmlinux_name) + if (!strchr("lda", params->command) && symbol_conf.vmlinux_name) symbol_conf.ignore_vmlinux_buildid = true; - switch (params.command) { + switch (params->command) { case 'l': - if (params.uprobes) { + if (params->uprobes) { pr_err(" Error: Don't use --list with --exec.\n"); parse_options_usage(probe_usage, options, "l", true); parse_options_usage(NULL, options, "x", true); return -EINVAL; } - ret = show_perf_probe_events(params.filter); + ret = show_perf_probe_events(params->filter); if (ret < 0) pr_err_with_code(" Error: Failed to show event list.", ret); return ret; case 'F': - ret = show_available_funcs(params.target, params.nsi, - params.filter, params.uprobes); + ret = show_available_funcs(params->target, params->nsi, + params->filter, params->uprobes); if (ret < 0) pr_err_with_code(" Error: Failed to show functions.", ret); return ret; #ifdef HAVE_DWARF_SUPPORT case 'L': - ret = show_line_range(¶ms.line_range, params.target, - params.nsi, params.uprobes); + ret = show_line_range(¶ms->line_range, params->target, + params->nsi, params->uprobes); if (ret < 0) pr_err_with_code(" Error: Failed to show lines.", ret); return ret; case 'V': - if (!params.filter) - params.filter = strfilter__new(DEFAULT_VAR_FILTER, + if (!params->filter) + params->filter = strfilter__new(DEFAULT_VAR_FILTER, NULL); - ret = show_available_vars(params.events, params.nevents, - params.filter); + ret = show_available_vars(params->events, params->nevents, + params->filter); if (ret < 0) pr_err_with_code(" Error: Failed to show vars.", ret); return ret; #endif case 'd': - ret = perf_del_probe_events(params.filter); + ret = perf_del_probe_events(params->filter); if (ret < 0) { pr_err_with_code(" Error: Failed to delete events.", ret); return ret; } break; case 'D': - if (probe_conf.bootconfig && params.uprobes) { + if (probe_conf.bootconfig && params->uprobes) { pr_err(" Error: --bootconfig doesn't support uprobes.\n"); return -EINVAL; } @@ -719,25 +728,25 @@ __cmd_probe(int argc, const char **argv) case 'a': /* Ensure the last given target is used */ - if (params.target && !params.target_used) { + if (params->target && !params->target_used) { pr_err(" Error: -x/-m must follow the probe definitions.\n"); parse_options_usage(probe_usage, options, "m", true); parse_options_usage(NULL, options, "x", true); return -EINVAL; } - ret = perf_add_probe_events(params.events, params.nevents); + ret = perf_add_probe_events(params->events, params->nevents); if (ret < 0) { /* * When perf_add_probe_events() fails it calls * cleanup_perf_probe_events(pevs, npevs), i.e. - * cleanup_perf_probe_events(params.events, params.nevents), which + * cleanup_perf_probe_events(params->events, params->nevents), which * will call clear_perf_probe_event(), so set nevents to zero * to avoid cleanup_params() to call clear_perf_probe_event() again * on the same pevs. */ - params.nevents = 0; + params->nevents = 0; pr_err_with_code(" Error: Failed to add events.", ret); return ret; } -- cgit v1.2.3 From 370ce164defd18069518e8b7faa6c92aad740257 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:57 -0700 Subject: perf path: Make mkpath thread safe, remove 16384 bytes from .bss Avoid 4 static arrays for paths, pass in a char[] buffer to use. Makes mkpath thread safe for the small number of users. Also removes 16,384 bytes from .bss. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-config.c | 4 +++- tools/perf/builtin-help.c | 4 +++- tools/perf/util/cache.h | 2 +- tools/perf/util/config.c | 3 ++- tools/perf/util/path.c | 35 +++++------------------------------ 5 files changed, 14 insertions(+), 34 deletions(-) diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index 2603015f98be..2e8363778935 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -12,6 +12,7 @@ #include "util/debug.h" #include "util/config.h" #include +#include #include #include @@ -157,7 +158,8 @@ int cmd_config(int argc, const char **argv) { int i, ret = -1; struct perf_config_set *set; - char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); + char path[PATH_MAX]; + char *user_config = mkpath(path, sizeof(path), "%s/.perfconfig", getenv("HOME")); const char *config_filename; bool changed = false; diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 3e7f52054fac..b2a368ae295a 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -389,9 +390,10 @@ static int get_html_page_path(char **page_path, const char *page) { struct stat st; const char *html_path = system_path(PERF_HTML_PATH); + char path[PATH_MAX]; /* Check that we have a perf documentation directory. */ - if (stat(mkpath("%s/perf.html", html_path), &st) + if (stat(mkpath(path, sizeof(path), "%s/perf.html", html_path), &st) || !S_ISREG(st.st_mode)) { pr_err("'%s': not a documentation directory.", html_path); return -1; diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 9f2e36ef5072..0b61840d4226 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -26,6 +26,6 @@ static inline int is_absolute_path(const char *path) return path[0] == '/'; } -char *mkpath(const char *fmt, ...) __printf(1, 2); +char *mkpath(char *path_buf, size_t sz, const char *fmt, ...) __printf(3, 4); #endif /* __PERF_CACHE_H */ diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 658170b8dcef..f340dc73db6d 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -543,6 +543,7 @@ static char *home_perfconfig(void) const char *home = NULL; char *config; struct stat st; + char path[PATH_MAX]; home = getenv("HOME"); @@ -554,7 +555,7 @@ static char *home_perfconfig(void) if (!home || !*home || !perf_config_global()) return NULL; - config = strdup(mkpath("%s/.perfconfig", home)); + config = strdup(mkpath(path, sizeof(path), "%s/.perfconfig", home)); if (config == NULL) { pr_warning("Not enough memory to process %s/.perfconfig, ignoring it.\n", home); return NULL; diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c index ce80b79be103..00adf872bf00 100644 --- a/tools/perf/util/path.c +++ b/tools/perf/util/path.c @@ -1,16 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * I'm tired of doing "vsnprintf()" etc just to open a - * file, so here's a "return static buffer with printf" - * interface for paths. - * - * It's obviously not thread-safe. Sue me. But it's quite - * useful for doing things like - * - * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY); - * - * which is what it's designed for. - */ #include "path.h" #include "cache.h" #include @@ -22,18 +10,6 @@ #include #include -static char bad_path[] = "/bad-path/"; -/* - * One hack: - */ -static char *get_pathname(void) -{ - static char pathname_array[4][PATH_MAX]; - static int idx; - - return pathname_array[3 & ++idx]; -} - static char *cleanup_path(char *path) { /* Clean it up */ @@ -45,18 +21,17 @@ static char *cleanup_path(char *path) return path; } -char *mkpath(const char *fmt, ...) +char *mkpath(char *path_buf, size_t sz, const char *fmt, ...) { va_list args; unsigned len; - char *pathname = get_pathname(); va_start(args, fmt); - len = vsnprintf(pathname, PATH_MAX, fmt, args); + len = vsnprintf(path_buf, sz, fmt, args); va_end(args); - if (len >= PATH_MAX) - return bad_path; - return cleanup_path(pathname); + if (len >= sz) + strncpy(path_buf, "/bad-path/", sz); + return cleanup_path(path_buf); } int path__join(char *bf, size_t size, const char *path1, const char *path2) -- cgit v1.2.3 From d9c26d45dbb51fe610f64b490f38f6ad15a00d7c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:58 -0700 Subject: perf scripting-engines: Move static to local variable, remove 16384 from .bss Avoid 16,384 bytes in .bss by stack allocating two bitmaps. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-perl.c | 4 ++-- tools/perf/util/scripting-engines/trace-event-python.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 039d0365ad41..65b761d83a1f 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -67,8 +67,6 @@ INTERP my_perl; #define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(unsigned short) * 8)) - 1) -static DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX); - extern struct scripting_context *scripting_context; static char *cur_field_name; @@ -353,7 +351,9 @@ static void perl_process_tracepoint(struct perf_sample *sample, void *data = sample->raw_data; unsigned long long nsecs = sample->time; const char *comm = thread__comm_str(thread); + DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX); + bitmap_zero(events_defined, TRACE_EVENT_TYPE_MAX); dSP; if (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 41d4f9e6a8b7..40964078f92f 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -93,8 +93,6 @@ PyMODINIT_FUNC PyInit_perf_trace_context(void); #define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(unsigned short) * 8)) - 1) -static DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX); - #define N_COMMON_FIELDS 7 static char *cur_field_name; @@ -934,6 +932,9 @@ static void python_process_tracepoint(struct perf_sample *sample, unsigned long long nsecs = sample->time; const char *comm = thread__comm_str(al->thread); const char *default_handler_name = "trace_unhandled"; + DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX); + + bitmap_zero(events_defined, TRACE_EVENT_TYPE_MAX); if (!event) { snprintf(handler_name, sizeof(handler_name), -- cgit v1.2.3 From 7a3fb8b5c4607b133a71d3f695d0f2653facec13 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:33:59 -0700 Subject: tools api fs: Dynamically allocate cgroupfs mount point cache, removing 4128 bytes from .bss Move the cgroupfs_cache_entry 4128 byte array out of .bss. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/cgroup.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/lib/api/fs/cgroup.c b/tools/lib/api/fs/cgroup.c index 1573dae4259d..250629a09423 100644 --- a/tools/lib/api/fs/cgroup.c +++ b/tools/lib/api/fs/cgroup.c @@ -14,7 +14,7 @@ struct cgroupfs_cache_entry { }; /* just cache last used one */ -static struct cgroupfs_cache_entry cached; +static struct cgroupfs_cache_entry *cached; int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys) { @@ -24,9 +24,9 @@ int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys) char *p, *path; char mountpoint[PATH_MAX]; - if (!strcmp(cached.subsys, subsys)) { - if (strlen(cached.mountpoint) < maxlen) { - strcpy(buf, cached.mountpoint); + if (cached && !strcmp(cached->subsys, subsys)) { + if (strlen(cached->mountpoint) < maxlen) { + strcpy(buf, cached->mountpoint); return 0; } return -1; @@ -91,8 +91,13 @@ int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys) free(line); fclose(fp); - strncpy(cached.subsys, subsys, sizeof(cached.subsys) - 1); - strcpy(cached.mountpoint, mountpoint); + if (!cached) + cached = calloc(1, sizeof(*cached)); + + if (cached) { + strncpy(cached->subsys, subsys, sizeof(cached->subsys) - 1); + strcpy(cached->mountpoint, mountpoint); + } if (mountpoint[0] && strlen(mountpoint) < maxlen) { strcpy(buf, mountpoint); -- cgit v1.2.3 From f50b8357f8955c899b704db88ffc180c5bf3f680 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:34:00 -0700 Subject: perf test pmu: Avoid 2 static path arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid two static paths that contributed 8,192 bytes to .bss are only used duing the perf parse pmu test. This change helps FORTIFY triggering 2 warnings like: ``` tests/pmu.c: In function ‘test__pmu’: tests/pmu.c:121:43: error: ‘%s’ directive output may be truncated writing up to 4095 bytes into a region of size 4090 [-Werror=format-truncation=] 121 | snprintf(buf, sizeof(buf), "rm -f %s/*\n", dir); ``` So make buf a little larger. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 3cf25f883df7..a4452639a3d4 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -86,17 +86,16 @@ static struct parse_events_term test_terms[] = { * Prepare format directory data, exported by kernel * at /sys/bus/event_source/devices//format. */ -static char *test_format_dir_get(void) +static char *test_format_dir_get(char *dir, size_t sz) { - static char dir[PATH_MAX]; unsigned int i; - snprintf(dir, PATH_MAX, "/tmp/perf-pmu-test-format-XXXXXX"); + snprintf(dir, sz, "/tmp/perf-pmu-test-format-XXXXXX"); if (!mkdtemp(dir)) return NULL; for (i = 0; i < ARRAY_SIZE(test_formats); i++) { - static char name[PATH_MAX]; + char name[PATH_MAX]; struct test_format *format = &test_formats[i]; FILE *file; @@ -118,12 +117,13 @@ static char *test_format_dir_get(void) /* Cleanup format directory. */ static int test_format_dir_put(char *dir) { - char buf[PATH_MAX]; - snprintf(buf, PATH_MAX, "rm -f %s/*\n", dir); + char buf[PATH_MAX + 20]; + + snprintf(buf, sizeof(buf), "rm -f %s/*\n", dir); if (system(buf)) return -1; - snprintf(buf, PATH_MAX, "rmdir %s\n", dir); + snprintf(buf, sizeof(buf), "rmdir %s\n", dir); return system(buf); } @@ -140,7 +140,8 @@ static struct list_head *test_terms_list(void) static int test__pmu(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - char *format = test_format_dir_get(); + char dir[PATH_MAX]; + char *format = test_format_dir_get(dir, sizeof(dir)); LIST_HEAD(formats); struct list_head *terms = test_terms_list(); int ret; -- cgit v1.2.3 From 200323768787a0ee02e01c35c1aff13dc9d77dde Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 11:34:01 -0700 Subject: libsubcmd: Avoid two path statics, removing 8192 bytes from .bss Use a single stack allocated buffer and avoid 8,192 bytes in .bss. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Ross Zwisler Cc: Sean Christopherson Cc: Steven Rostedt (VMware) Cc: Tiezhu Yang Cc: Yang Jihong Link: https://lore.kernel.org/r/20230526183401.2326121-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/exec-cmd.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/tools/lib/subcmd/exec-cmd.c b/tools/lib/subcmd/exec-cmd.c index 5dbea456973e..7739b5217cf6 100644 --- a/tools/lib/subcmd/exec-cmd.c +++ b/tools/lib/subcmd/exec-cmd.c @@ -36,38 +36,40 @@ static int is_absolute_path(const char *path) return path[0] == '/'; } -static const char *get_pwd_cwd(void) +static const char *get_pwd_cwd(char *buf, size_t sz) { - static char cwd[PATH_MAX + 1]; char *pwd; struct stat cwd_stat, pwd_stat; - if (getcwd(cwd, PATH_MAX) == NULL) + if (getcwd(buf, sz) == NULL) return NULL; pwd = getenv("PWD"); - if (pwd && strcmp(pwd, cwd)) { - stat(cwd, &cwd_stat); + if (pwd && strcmp(pwd, buf)) { + stat(buf, &cwd_stat); if (!stat(pwd, &pwd_stat) && pwd_stat.st_dev == cwd_stat.st_dev && pwd_stat.st_ino == cwd_stat.st_ino) { - strlcpy(cwd, pwd, PATH_MAX); + strlcpy(buf, pwd, sz); } } - return cwd; + return buf; } -static const char *make_nonrelative_path(const char *path) +static const char *make_nonrelative_path(char *buf, size_t sz, const char *path) { - static char buf[PATH_MAX + 1]; - if (is_absolute_path(path)) { - if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) + if (strlcpy(buf, path, sz) >= sz) die("Too long path: %.*s", 60, path); } else { - const char *cwd = get_pwd_cwd(); + const char *cwd = get_pwd_cwd(buf, sz); + if (!cwd) die("Cannot determine the current working directory"); - if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX) + + if (strlen(cwd) + strlen(path) + 2 >= sz) die("Too long path: %.*s", 60, path); + + strcat(buf, "/"); + strcat(buf, path); } return buf; } @@ -133,8 +135,11 @@ static void add_path(char **out, const char *path) if (path && *path) { if (is_absolute_path(path)) astrcat(out, path); - else - astrcat(out, make_nonrelative_path(path)); + else { + char buf[PATH_MAX]; + + astrcat(out, make_nonrelative_path(buf, sizeof(buf), path)); + } astrcat(out, ":"); } -- cgit v1.2.3 From 422db30713ac84080a8c4b3efa9dd560b654ed57 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 29 May 2023 19:14:33 -0700 Subject: perf kvm powerpc: Add missing rename opf pmu_have_event() to perf_pmus__have_event() Missed function rename from pmu_have_event to perf_pmus__have_event made the perf build fail on powerpc. Committer notes: The perf_pmus__have_event() is declared in util/pmus.h, so use it instead of by now needless util/pmu.h. Fixes: 1eaf496ed386934f ("perf pmu: Separate pmu and pmus") Reported-by: Stephen Rothwell Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230530021433.3107580-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/kvm-stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index ea1220d66b67..d9a0ac1cdf30 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -5,7 +5,7 @@ #include "util/debug.h" #include "util/evsel.h" #include "util/evlist.h" -#include "util/pmu.h" +#include "util/pmus.h" #include "book3s_hv_exits.h" #include "book3s_hcalls.h" @@ -204,7 +204,7 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); if (!event) { - if (pmu_have_event("trace_imc", "trace_cycles")) { + if (perf_pmus__have_event("trace_imc", "trace_cycles")) { argv[j++] = strdup("-e"); argv[j++] = strdup("trace_imc/trace_cycles/"); *argc += 2; -- cgit v1.2.3 From e23421426e1364299e9ee5091022058ba10fee63 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 1 Jun 2023 01:29:51 -0700 Subject: perf pmu: Correct perf_pmu__auto_merge_stats() affecting hybrid Flip the return value correcting a bug. Fixes: 6b9da260703096b3 ("perf pmu: Remove is_pmu_hybrid") Reported-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601082954.754318-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 0520aa9fe991..36e163f38368 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1424,7 +1424,7 @@ bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) { - return pmu->is_core && perf_pmus__num_core_pmus() > 1; + return pmu->is_core && perf_pmus__num_core_pmus() == 1; } bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name) -- cgit v1.2.3 From 1f4326bf83ce02ae8f7d50240c367fbb7bf28343 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 1 Jun 2023 01:29:52 -0700 Subject: perf evsel: Add verbose 3 print of evsel name when opening It is often useful to know not just the attribute and perf_event_open() details when opening an evsel, but also the evsel's name. Add this debug output for verbose 3 so that it won't interfere with the current verbose 2 output. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601082954.754318-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ed2e07186b71..f607b5bddc76 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2039,6 +2039,7 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, fallback_missing_features: evsel__disable_missing_features(evsel); + pr_debug3("Opening: %s\n", evsel__name(evsel)); display_attr(&evsel->core.attr); for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) { -- cgit v1.2.3 From 251aa040244a3b17068e4e6ec61f138d7e50681a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 1 Jun 2023 01:29:53 -0700 Subject: perf parse-events: Wildcard most "numeric" events Numeric events are either raw events or those with ABI defined numbers matched by the lexer. PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events should wildcard match on hybrid systems. So "cycles" should match each PMU type with an extended type, not just PERF_TYPE_HARDWARE. Change wildcard matching to add the event even if wildcard PMU scanning fails, there will be no extended type but this best matches previous behavior. Only set the extended type when the event type supports it and when perf_pmus__supports_extended_type is true. This new function returns true if >1 core PMU and avoids potential errors on older kernels. Modify evsel__compute_group_pmu_name using a helper perf_pmu__is_software to determine when grouping should occur. Try to use PMUs, and evsel__find_pmu, as being more dependable than evsel->pmu_name. Set a parse events error if a hardware term's PMU lookup fails, to provide extra diagnostics. Fixes: 8bc75f699c141420 ("perf parse-events: Support wildcards on raw events") Reported-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601082954.754318-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 104 +++++++++++++++++++++++++++++------------ tools/perf/util/parse-events.y | 10 ++-- tools/perf/util/pmu.c | 16 +++++++ tools/perf/util/pmu.h | 5 ++ tools/perf/util/pmus.c | 5 ++ tools/perf/util/pmus.h | 1 + 6 files changed, 106 insertions(+), 35 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 7f047ac11168..26979a47f4ac 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -372,7 +372,7 @@ static int config_attr(struct perf_event_attr *attr, * contain hyphens and the longest name * should always be selected. */ -int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config) +int parse_events__decode_legacy_cache(const char *name, int extended_pmu_type, __u64 *config) { int len, cache_type = -1, cache_op = -1, cache_result = -1; const char *name_end = &name[strlen(name) + 1]; @@ -423,8 +423,9 @@ int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *con if (cache_result == -1) cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; - *config = ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT) | - cache_type | (cache_op << 8) | (cache_result << 16); + *config = cache_type | (cache_op << 8) | (cache_result << 16); + if (perf_pmus__supports_extended_type()) + *config |= (__u64)extended_pmu_type << PERF_PMU_TYPE_SHIFT; return 0; } @@ -1204,11 +1205,17 @@ static int config_term_pmu(struct perf_event_attr *attr, const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); if (!pmu) { - pr_debug("Failed to find PMU for type %d", attr->type); + char *err_str; + + if (asprintf(&err_str, "Failed to find PMU for type %d", attr->type) >= 0) + parse_events_error__handle(err, term->err_term, + err_str, /*help=*/NULL); return -EINVAL; } attr->type = PERF_TYPE_HARDWARE; - attr->config = ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT) | term->val.num; + attr->config = term->val.num; + if (perf_pmus__supports_extended_type()) + attr->config |= (__u64)pmu->type << PERF_PMU_TYPE_SHIFT; return 0; } if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER || @@ -1435,8 +1442,8 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, static int __parse_events_add_numeric(struct parse_events_state *parse_state, struct list_head *list, - struct perf_pmu *pmu, u32 type, u64 config, - struct list_head *head_config) + struct perf_pmu *pmu, u32 type, u32 extended_type, + u64 config, struct list_head *head_config) { struct perf_event_attr attr; LIST_HEAD(config_terms); @@ -1446,6 +1453,10 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, memset(&attr, 0, sizeof(attr)); attr.type = type; attr.config = config; + if (extended_type && (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)) { + assert(perf_pmus__supports_extended_type()); + attr.config |= (u64)extended_type << PERF_PMU_TYPE_SHIFT; + }; if (head_config) { if (config_attr(&attr, head_config, parse_state->error, @@ -1474,24 +1485,26 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, struct perf_pmu *pmu = NULL; bool found_supported = false; - if (!wildcard) - return __parse_events_add_numeric(parse_state, list, /*pmu=*/NULL, - type, config, head_config); - /* Wildcards on numeric values are only supported by core PMUs. */ - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { - int ret; + if (wildcard && perf_pmus__supports_extended_type()) { + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + int ret; - if (parse_events__filter_pmu(parse_state, pmu)) - continue; + found_supported = true; + if (parse_events__filter_pmu(parse_state, pmu)) + continue; - found_supported = true; - ret = __parse_events_add_numeric(parse_state, list, pmu, pmu->type, - config, head_config); - if (ret) - return ret; + ret = __parse_events_add_numeric(parse_state, list, pmu, + type, pmu->type, + config, head_config); + if (ret) + return ret; + } + if (found_supported) + return 0; } - return found_supported ? 0 : -EINVAL; + return __parse_events_add_numeric(parse_state, list, perf_pmus__find_by_type(type), + type, /*extended_type=*/0, config, head_config); } int parse_events_add_tool(struct parse_events_state *parse_state, @@ -1989,8 +2002,22 @@ static int evsel__compute_group_pmu_name(struct evsel *evsel, { struct evsel *leader = evsel__leader(evsel); struct evsel *pos; - const char *group_pmu_name = evsel->pmu_name ?: "cpu"; + const char *group_pmu_name; + struct perf_pmu *pmu = evsel__find_pmu(evsel); + if (!pmu) { + /* + * For PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE types the PMU + * is a core PMU, but in heterogeneous systems this is + * unknown. For now pick the first core PMU. + */ + pmu = perf_pmus__scan_core(NULL); + } + if (!pmu) { + pr_debug("No PMU found for '%s'", evsel__name(evsel)); + return -EINVAL; + } + group_pmu_name = pmu->name; /* * Software events may be in a group with other uncore PMU events. Use * the pmu_name of the first non-software event to avoid breaking the @@ -1999,24 +2026,41 @@ static int evsel__compute_group_pmu_name(struct evsel *evsel, * Aux event leaders, like intel_pt, expect a group with events from * other PMUs, so substitute the AUX event's PMU in this case. */ - if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) { + if (perf_pmu__is_software(pmu) || evsel__is_aux_event(leader)) { + struct perf_pmu *leader_pmu = evsel__find_pmu(leader); + + if (!leader_pmu) { + /* As with determining pmu above. */ + leader_pmu = perf_pmus__scan_core(NULL); + } /* * Starting with the leader, find the first event with a named - * PMU. for_each_group_(member|evsel) isn't used as the list - * isn't yet sorted putting evsel's in the same group together. + * non-software PMU. for_each_group_(member|evsel) isn't used as + * the list isn't yet sorted putting evsel's in the same group + * together. */ - if (leader->pmu_name) { - group_pmu_name = leader->pmu_name; + if (leader_pmu && !perf_pmu__is_software(leader_pmu)) { + group_pmu_name = leader_pmu->name; } else if (leader->core.nr_members > 1) { list_for_each_entry(pos, head, core.node) { - if (evsel__leader(pos) == leader && pos->pmu_name) { - group_pmu_name = pos->pmu_name; + struct perf_pmu *pos_pmu; + + if (pos == leader || evsel__leader(pos) != leader) + continue; + pos_pmu = evsel__find_pmu(pos); + if (!pos_pmu) { + /* As with determining pmu above. */ + pos_pmu = perf_pmus__scan_core(NULL); + } + if (pos_pmu && !perf_pmu__is_software(pos_pmu)) { + group_pmu_name = pos_pmu->name; break; } } } } - evsel->group_pmu_name = strdup(group_pmu_name); + /* Assign the actual name taking care that the fake PMU lacks a name. */ + evsel->group_pmu_name = strdup(group_pmu_name ?: "fake"); return evsel->group_pmu_name ? 0 : -ENOMEM; } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index abd6ab460e12..f96afb0edd0c 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -445,11 +445,11 @@ value_sym '/' event_config '/' int type = $1 >> 16; int config = $1 & 255; int err; + bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_numeric(_parse_state, list, type, config, $3, - /*wildcard=*/false); + err = parse_events_add_numeric(_parse_state, list, type, config, $3, wildcard); parse_events_terms__delete($3); if (err) { free_list_evsel(list); @@ -463,12 +463,12 @@ value_sym sep_slash_slash_dc struct list_head *list; int type = $1 >> 16; int config = $1 & 255; + bool wildcard = (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE); list = alloc_list(); ABORT_ON(!list); ABORT_ON(parse_events_add_numeric(_parse_state, list, type, config, - /*head_config=*/NULL, - /*wildcard=*/false)); + /*head_config=*/NULL, wildcard)); $$ = list; } | @@ -635,7 +635,7 @@ PE_RAW opt_event_config ABORT_ON(errno); free($1); err = parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, num, $2, - /*wildcard=*/true); + /*wildcard=*/false); parse_events_terms__delete($2); if (err) { free(list); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 36e163f38368..1dd44b2f73f3 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1438,6 +1438,22 @@ bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name) return false; } +bool perf_pmu__is_software(const struct perf_pmu *pmu) +{ + if (pmu->is_core || pmu->is_uncore || pmu->auxtrace) + return false; + switch (pmu->type) { + case PERF_TYPE_HARDWARE: return false; + case PERF_TYPE_SOFTWARE: return true; + case PERF_TYPE_TRACEPOINT: return true; + case PERF_TYPE_HW_CACHE: return false; + case PERF_TYPE_RAW: return false; + case PERF_TYPE_BREAKPOINT: return true; + default: break; + } + return !strcmp(pmu->name, "kprobe") || !strcmp(pmu->name, "uprobe"); +} + FILE *perf_pmu__open_file(struct perf_pmu *pmu, const char *name) { char path[PATH_MAX]; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 287f593b15c7..13a9a893e665 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -224,6 +224,11 @@ bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu); bool perf_pmu__have_event(const struct perf_pmu *pmu, const char *name); +/** + * perf_pmu_is_software - is the PMU a software PMU as in it uses the + * perf_sw_context in the kernel? + */ +bool perf_pmu__is_software(const struct perf_pmu *pmu); FILE *perf_pmu__open_file(struct perf_pmu *pmu, const char *name); FILE *perf_pmu__open_file_at(struct perf_pmu *pmu, int dirfd, const char *name); diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 53f11f6ce878..e1d0a93147e5 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -477,6 +477,11 @@ int perf_pmus__num_core_pmus(void) return count; } +bool perf_pmus__supports_extended_type(void) +{ + return perf_pmus__num_core_pmus() > 1; +} + struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) { struct perf_pmu *pmu = evsel->pmu; diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index 1e710720aec7..d02ffea5d3a4 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -19,5 +19,6 @@ int perf_pmus__num_mem_pmus(void); void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state); bool perf_pmus__have_event(const char *pname, const char *name); int perf_pmus__num_core_pmus(void); +bool perf_pmus__supports_extended_type(void); #endif /* __PMUS_H */ -- cgit v1.2.3 From 27c9fcfc1e14ca9dff930d55cadd8ee4a34e4321 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 1 Jun 2023 01:29:54 -0700 Subject: perf test: Update parse-events expectations to test for multiple events With PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events opening on multiple PMUs, the test expectations need updating to test for multiple events. TODOs are added to document existing hybrid perf bugs. Tested on hybrid alderlake and non-hybrid tigerlake. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601082954.754318-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 1108 +++++++++++++++++++++------------------ 1 file changed, 590 insertions(+), 518 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 9d05bc551791..bba1cd655a1d 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -324,13 +324,17 @@ static int test__checkevent_numeric_modifier(struct evlist *evlist) static int test__checkevent_symbolic_name_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct perf_evsel *evsel; - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == perf_pmus__num_core_pmus()); + perf_evlist__for_each_entry(&evlist->core, evsel) { + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + } return test__checkevent_symbolic_name(evlist); } @@ -620,24 +624,28 @@ static int test__checkevent_pmu_events(struct evlist *evlist) static int test__checkevent_pmu_events_mix(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); - - /* pmu-event:u */ - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong exclude_user", - !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", - evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); - TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + struct evsel *evsel = NULL; + /* + * The wild card event will be opened at least once, but it may be + * opened on each core PMU. + */ + TEST_ASSERT_VAL("wrong number of entries", evlist->core.nr_entries >= 2); + for (int i = 0; i < evlist->core.nr_entries - 1; i++) { + evsel = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + /* pmu-event:u */ + TEST_ASSERT_VAL("wrong exclude_user", + !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", + evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); + TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + } /* cpu/pmu-event/u*/ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type || - strcmp(evsel->pmu_name, "cpu")); + TEST_ASSERT_VAL("wrong type", evsel__find_pmu(evsel)->is_core); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", @@ -734,181 +742,207 @@ static int test__group1(struct evlist *evlist) { struct evsel *evsel, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); - - /* instructions:k */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* cycles:upp */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - /* use of precise requires exclude_guest */ - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (perf_pmus__num_core_pmus() * 2)); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == perf_pmus__num_core_pmus()); + + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* instructions:k */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + /* cycles:upp */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + /* use of precise requires exclude_guest */ + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } return TEST_OK; } static int test__group2(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel, *leader = NULL; - TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus() + 1)); + /* + * TODO: Currently the software event won't be grouped with the hardware + * event except for 1 PMU. + */ TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); - /* faults + :ku modifier */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_SW_PAGE_FAULTS)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* cache-references + :u modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_REFERENCES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* cycles:k */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == PERF_TYPE_SOFTWARE) { + /* faults + :ku modifier */ + leader = evsel; + TEST_ASSERT_VAL("wrong config", + test_config(evsel, PERF_COUNT_SW_PAGE_FAULTS)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + continue; + } + if (evsel->core.attr.type == PERF_TYPE_HARDWARE && + test_config(evsel, PERF_COUNT_HW_CACHE_REFERENCES)) { + /* cache-references + :u modifier */ + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + if (evsel__has_leader(evsel, leader)) + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + continue; + } + /* cycles:k */ + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } return TEST_OK; } #ifdef HAVE_LIBTRACEEVENT static int test__group3(struct evlist *evlist __maybe_unused) { - struct evsel *evsel, *leader; + struct evsel *evsel, *group1_leader = NULL, *group2_leader = NULL; - TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (3 * perf_pmus__num_core_pmus() + 2)); + /* + * Currently the software event won't be grouped with the hardware event + * except for 1 PMU. This means there are always just 2 groups + * regardless of the number of core PMUs. + */ TEST_ASSERT_VAL("wrong number of groups", 2 == evlist__nr_groups(evlist)); - /* group1 syscalls:sys_enter_openat:H */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong sample_type", - PERF_TP_SAMPLE_TYPE == evsel->core.attr.sample_type); - TEST_ASSERT_VAL("wrong sample_period", 1 == evsel->core.attr.sample_period); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong group name", - !strcmp(leader->group_name, "group1")); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* group1 cycles:kppp */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - /* use of precise requires exclude_guest */ - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 3); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* group2 cycles + G modifier */ - evsel = leader = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong group name", - !strcmp(leader->group_name, "group2")); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* group2 1:3 + G modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, 3)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* instructions:u */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == PERF_TYPE_TRACEPOINT) { + /* group1 syscalls:sys_enter_openat:H */ + group1_leader = evsel; + TEST_ASSERT_VAL("wrong sample_type", + evsel->core.attr.sample_type == PERF_TP_SAMPLE_TYPE); + TEST_ASSERT_VAL("wrong sample_period", 1 == evsel->core.attr.sample_period); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong group name", !strcmp(evsel->group_name, "group1")); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + continue; + } + if (evsel->core.attr.type == PERF_TYPE_HARDWARE && + test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)) { + if (evsel->core.attr.exclude_user) { + /* group1 cycles:kppp */ + TEST_ASSERT_VAL("wrong exclude_user", + evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", + !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + /* use of precise requires exclude_guest */ + TEST_ASSERT_VAL("wrong exclude guest", + evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", + !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", + evsel->core.attr.precise_ip == 3); + if (evsel__has_leader(evsel, group1_leader)) { + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong group_idx", + evsel__group_idx(evsel) == 1); + } + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } else { + /* group2 cycles + G modifier */ + group2_leader = evsel; + TEST_ASSERT_VAL("wrong exclude_kernel", + !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", + !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", + !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", + evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + if (evsel->core.nr_members == 2) { + TEST_ASSERT_VAL("wrong group_idx", + evsel__group_idx(evsel) == 0); + } + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } + continue; + } + if (evsel->core.attr.type == 1) { + /* group2 1:3 + G modifier */ + TEST_ASSERT_VAL("wrong config", test_config(evsel, 3)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + if (evsel__has_leader(evsel, group2_leader)) + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + continue; + } + /* instructions:u */ + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } return TEST_OK; } #endif @@ -917,402 +951,435 @@ static int test__group4(struct evlist *evlist __maybe_unused) { struct evsel *evsel, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); - - /* cycles:u + p */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - /* use of precise requires exclude_guest */ - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 1); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* instructions:kp + p */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - /* use of precise requires exclude_guest */ - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (perf_pmus__num_core_pmus() * 2)); + TEST_ASSERT_VAL("wrong number of groups", + perf_pmus__num_core_pmus() == evlist__nr_groups(evlist)); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles:u + p */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + /* use of precise requires exclude_guest */ + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 1); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + + /* instructions:kp + p */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + /* use of precise requires exclude_guest */ + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip == 2); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } return TEST_OK; } static int test__group5(struct evlist *evlist __maybe_unused) { - struct evsel *evsel, *leader; - - TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 2 == evlist__nr_groups(evlist)); - - /* cycles + G */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + struct evsel *evsel = NULL, *leader; - /* instructions + G */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (5 * perf_pmus__num_core_pmus())); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == (2 * perf_pmus__num_core_pmus())); - /* cycles:G */ - evsel = leader = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - - /* instructions:G */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles + G */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); - /* cycles */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + /* instructions + G */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + } + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles:G */ + evsel = leader = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong sample_read", !evsel->sample_read); + /* instructions:G */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + } + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + } return TEST_OK; } static int test__group_gh1(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus())); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == perf_pmus__num_core_pmus()); - /* cycles + :H group modifier */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - - /* cache-misses:G + :H group modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles + :H group modifier */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + /* cache-misses:G + :H group modifier */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + } return TEST_OK; } static int test__group_gh2(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); - - /* cycles + :G group modifier */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus())); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == perf_pmus__num_core_pmus()); - /* cache-misses:H + :G group modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles + :G group modifier */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + /* cache-misses:H + :G group modifier */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + } return TEST_OK; } static int test__group_gh3(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus())); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == perf_pmus__num_core_pmus()); - /* cycles:G + :u group modifier */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - - /* cache-misses:H + :u group modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles:G + :u group modifier */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + /* cache-misses:H + :u group modifier */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + } return TEST_OK; } static int test__group_gh4(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); - TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus())); + TEST_ASSERT_VAL("wrong number of groups", + evlist__nr_groups(evlist) == perf_pmus__num_core_pmus()); - /* cycles:G + :uG group modifier */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); - TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); - - /* cache-misses:H + :uG group modifier */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles:G + :uG group modifier */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + /* cache-misses:H + :uG group modifier */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + } return TEST_OK; } static int test__leader_sample1(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (3 * perf_pmus__num_core_pmus())); - /* cycles - sampling group leader */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); - - /* cache-misses - not sampling */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles - sampling group leader */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); - /* branch-misses - not sampling */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + /* cache-misses - not sampling */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + /* branch-misses - not sampling */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + } return TEST_OK; } static int test__leader_sample2(struct evlist *evlist __maybe_unused) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (2 * perf_pmus__num_core_pmus())); - /* instructions - sampling group leader */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); - - /* branch-misses - not sampling */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); - TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); - TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* instructions - sampling group leader */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + /* branch-misses - not sampling */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + TEST_ASSERT_VAL("wrong sample_read", evsel->sample_read); + } return TEST_OK; } static int test__checkevent_pinned_modifier(struct evlist *evlist) { - struct evsel *evsel = evlist__first(evlist); + struct evsel *evsel = NULL; - TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); - TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); - TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); - TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); - TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == perf_pmus__num_core_pmus()); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + evsel = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); + } return test__checkevent_symbolic_name(evlist); } static int test__pinned_group(struct evlist *evlist) { - struct evsel *evsel, *leader; + struct evsel *evsel = NULL, *leader; - TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (3 * perf_pmus__num_core_pmus())); - /* cycles - group leader */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles - group leader */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + /* TODO: The group modifier is not copied to the split group leader. */ + if (perf_pmus__num_core_pmus() == 1) + TEST_ASSERT_VAL("wrong pinned", evsel->core.attr.pinned); - /* cache-misses - can not be pinned, but will go on with the leader */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); - - /* branch-misses - ditto */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); - TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); + /* cache-misses - can not be pinned, but will go on with the leader */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); + /* branch-misses - ditto */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); + } return TEST_OK; } @@ -1331,29 +1398,33 @@ static int test__checkevent_exclusive_modifier(struct evlist *evlist) static int test__exclusive_group(struct evlist *evlist) { - struct evsel *evsel, *leader; - - TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries); + struct evsel *evsel = NULL, *leader; - /* cycles - group leader */ - evsel = leader = evlist__first(evlist); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); - TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); - TEST_ASSERT_VAL("wrong exclusive", evsel->core.attr.exclusive); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (3 * perf_pmus__num_core_pmus())); - /* cache-misses - can not be pinned, but will go on with the leader */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); - TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + for (int i = 0; i < perf_pmus__num_core_pmus(); i++) { + /* cycles - group leader */ + evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + /* TODO: The group modifier is not copied to the split group leader. */ + if (perf_pmus__num_core_pmus() == 1) + TEST_ASSERT_VAL("wrong exclusive", evsel->core.attr.exclusive); - /* branch-misses - ditto */ - evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); - TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + /* cache-misses - can not be pinned, but will go on with the leader */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + /* branch-misses - ditto */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); + } return TEST_OK; } static int test__checkevent_breakpoint_len(struct evlist *evlist) @@ -1403,7 +1474,8 @@ static int test__checkevent_precise_max_modifier(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); - TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong number of entries", + evlist->core.nr_entries == (1 + perf_pmus__num_core_pmus())); TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_SW_TASK_CLOCK)); return TEST_OK; -- cgit v1.2.3 From 68c250434125f94dddcff7e9faf392fa96773ac3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 31 May 2023 19:36:43 -0700 Subject: perf pmu: Only warn about unsupported formats once Avoid scanning format list for each event parsed. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Rob Herring Cc: Suzuki Poulouse Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601023644.587584-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 5 +++++ tools/perf/util/pmu.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 1dd44b2f73f3..4218b5235b3d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -934,6 +934,11 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) { struct perf_pmu_format *format; + if (pmu->formats_checked) + return; + + pmu->formats_checked = true; + /* fake pmu doesn't have format list */ if (pmu == &perf_pmu__fake) return; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 13a9a893e665..c21872c0f328 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -76,6 +76,11 @@ struct perf_pmu { * specific code. */ bool auxtrace; + /** + * @formats_checked: Only check PMU's formats are valid for + * perf_event_attr once. + */ + bool formats_checked; /** * @max_precise: Number of levels of :ppp precision supported by the * PMU, read from -- cgit v1.2.3 From b9f010328c0f5af017b0fb9ca24a5c531bc3c682 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 31 May 2023 19:36:44 -0700 Subject: perf pmu: Warn about invalid config for all PMUs and configs Don't just check the raw PMU type, the only core PMU on homogeneous x86, check raw and all dynamically added PMUs. Extend the perf_pmu__warn_invalid_config to check all 4 config values. Rather than process the format list once per event, store the computed masks for each config value. Don't ignore the mask being zero, which is likely for config2 and config3, add config_masks_present so config values can be ignored only when no format information is present. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Rob Herring Cc: Suzuki Poulouse Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230601023644.587584-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 13 ++++++++++--- tools/perf/util/pmu.c | 38 +++++++++++++++++++++++++++----------- tools/perf/util/pmu.h | 13 ++++++++++++- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 26979a47f4ac..629f7bd9fd59 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -245,9 +245,16 @@ __add_event(struct list_head *list, int *idx, if (pmu) perf_pmu__warn_invalid_formats(pmu); - if (pmu && attr->type == PERF_TYPE_RAW) - perf_pmu__warn_invalid_config(pmu, attr->config, name); - + if (pmu && (attr->type == PERF_TYPE_RAW || attr->type >= PERF_TYPE_MAX)) { + perf_pmu__warn_invalid_config(pmu, attr->config, name, + PERF_PMU_FORMAT_VALUE_CONFIG, "config"); + perf_pmu__warn_invalid_config(pmu, attr->config1, name, + PERF_PMU_FORMAT_VALUE_CONFIG1, "config1"); + perf_pmu__warn_invalid_config(pmu, attr->config2, name, + PERF_PMU_FORMAT_VALUE_CONFIG2, "config2"); + perf_pmu__warn_invalid_config(pmu, attr->config3, name, + PERF_PMU_FORMAT_VALUE_CONFIG3, "config3"); + } if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 4218b5235b3d..fe64ad292d36 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1627,37 +1627,53 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return pmu->nr_caps; } -void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, - const char *name) +static void perf_pmu__compute_config_masks(struct perf_pmu *pmu) { struct perf_pmu_format *format; - __u64 masks = 0, bits; - char buf[100]; - unsigned int i; + + if (pmu->config_masks_computed) + return; list_for_each_entry(format, &pmu->format, list) { - if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + unsigned int i; + __u64 *mask; + + if (format->value >= PERF_PMU_FORMAT_VALUE_CONFIG_END) continue; + pmu->config_masks_present = true; + mask = &pmu->config_masks[format->value]; + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) - masks |= 1ULL << i; + *mask |= 1ULL << i; } + pmu->config_masks_computed = true; +} + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + const char *name, int config_num, + const char *config_name) +{ + __u64 bits; + char buf[100]; + + perf_pmu__compute_config_masks(pmu); /* * Kernel doesn't export any valid format bits. */ - if (masks == 0) + if (!pmu->config_masks_present) return; - bits = config & ~masks; + bits = config & ~pmu->config_masks[config_num]; if (bits == 0) return; bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); - pr_warning("WARNING: event '%s' not valid (bits %s of config " + pr_warning("WARNING: event '%s' not valid (bits %s of %s " "'%llx' not supported by kernel)!\n", - name ?: "N/A", buf, config); + name ?: "N/A", buf, config_name, config); } int perf_pmu__match(char *pattern, char *name, char *tok) diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index c21872c0f328..8807a624e918 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -81,6 +81,10 @@ struct perf_pmu { * perf_event_attr once. */ bool formats_checked; + /** @config_masks_present: Are there config format values? */ + bool config_masks_present; + /** @config_masks_computed: Set when masks are lazily computed. */ + bool config_masks_computed; /** * @max_precise: Number of levels of :ppp precision supported by the * PMU, read from @@ -125,6 +129,12 @@ struct perf_pmu { /** @list: Element on pmus list in pmu.c. */ struct list_head list; + /** + * @config_masks: Derived from the PMU's format data, bits that are + * valid within the config value. + */ + __u64 config_masks[PERF_PMU_FORMAT_VALUE_CONFIG_END]; + /** * @missing_features: Features to inhibit when events on this PMU are * opened. @@ -260,7 +270,8 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, - const char *name); + const char *name, int config_num, + const char *config_name); void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu); int perf_pmu__match(char *pattern, char *name, char *tok); -- cgit v1.2.3 From fe8e04348727f992f6fce3709639fb6d92a81137 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 May 2023 13:32:36 -0700 Subject: perf script: Increase PID/TID width for output On large systems, it's common that PID/TID is bigger than 5-digit and it makes the output unaligned. Let's increase the width to 7. Before: $ perf script ... swapper 0 [006] 1540823.803935: 1369324 cycles:P: ffffffff9c755588 ktime_get+0x18 ([kernel.kallsyms]) gvfsd-dnssd 95114 [004] 1540823.804164: 1643871 cycles:P: ffffffff9cfdca5c __get_user_8+0x1c ([kernel.kallsyms]) perf-exec 1558582 [000] 1540823.804209: 1018714 cycles:P: ffffffff9c924ab9 __slab_free+0x9 ([kernel.kallsyms]) nmcli 1558589 [007] 1540823.804384: 1859212 cycles:P: 7f70537a8ad8 __strchrnul_evex+0x18 (/usr/lib/x86_64-linux-gnu/libc.so.6> sleep 1558582 [000] 1540823.804456: 987425 cycles:P: 7fd35bb27b30 _dl_init+0x0 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2> dbus-daemon 3043 [003] 1540823.804575: 1564465 cycles:P: ffffffff9cb2bb70 llist_add_batch+0x0 ([kernel.kallsyms]) gdbus 1558592 [001] 1540823.804766: 1315219 cycles:P: ffffffff9c797b2e audit_filter_syscall+0x9e ([kernel.kallsyms]) NetworkManager 3452 [005] 1540823.805301: 1558782 cycles:P: 7fa957737748 g_bit_lock+0x58 (/usr/lib/x86_64-linux-gnu/libglib-2.0.so.0.7400.5> After: $ perf script ... swapper 0 [006] 1540823.803935: 1369324 cycles:P: ffffffff9c755588 ktime_get+0x18 ([kernel.kallsyms]) gvfsd-dnssd 95114 [004] 1540823.804164: 1643871 cycles:P: ffffffff9cfdca5c __get_user_8+0x1c ([kernel.kallsyms]) perf-exec 1558582 [000] 1540823.804209: 1018714 cycles:P: ffffffff9c924ab9 __slab_free+0x9 ([kernel.kallsyms]) nmcli 1558589 [007] 1540823.804384: 1859212 cycles:P: 7f70537a8ad8 __strchrnul_evex+0x18 (/usr/lib/x86_64-linux-gnu/libc.so.6> sleep 1558582 [000] 1540823.804456: 987425 cycles:P: 7fd35bb27b30 _dl_init+0x0 (/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2> dbus-daemon 3043 [003] 1540823.804575: 1564465 cycles:P: ffffffff9cb2bb70 llist_add_batch+0x0 ([kernel.kallsyms]) gdbus 1558592 [001] 1540823.804766: 1315219 cycles:P: ffffffff9c797b2e audit_filter_syscall+0x9e ([kernel.kallsyms]) NetworkManager 3452 [005] 1540823.805301: 1558782 cycles:P: 7fa957737748 g_bit_lock+0x58 (/usr/lib/x86_64-linux-gnu/libglib-2.0.so.0.7400.5> Reviewer notes: Adrian added: "Might be worth noting that currently the biggest PID_MAX_LIMIT is 2^22 so pids don't get bigger than 7 digits presently" $ echo $((2 ** 22)) 4194304 $ echo -n $((2 ** 22)) | wc -c 7 $ Signed-off-by: Namhyung Kim Acked-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20230531203236.1602054-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 029d5a597233..70549fc93b12 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -801,11 +801,11 @@ static int perf_sample__fprintf_start(struct perf_script *script, } if (PRINT_FIELD(PID) && PRINT_FIELD(TID)) - printed += fprintf(fp, "%5d/%-5d ", sample->pid, sample->tid); + printed += fprintf(fp, "%7d/%-7d ", sample->pid, sample->tid); else if (PRINT_FIELD(PID)) - printed += fprintf(fp, "%5d ", sample->pid); + printed += fprintf(fp, "%7d ", sample->pid); else if (PRINT_FIELD(TID)) - printed += fprintf(fp, "%5d ", sample->tid); + printed += fprintf(fp, "%7d ", sample->tid); if (PRINT_FIELD(CPU)) { if (latency_format) -- cgit v1.2.3 From 16203e9cd01896b4244100a8e3fb9f6e612ab2b1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 2 Jun 2023 15:38:25 -0300 Subject: perf bench: Add missing setlocale() call to allow usage of %'d style formatting Without this we were not getting the thousands separator for big numbers. Noticed while developing 'perf bench uprobe', but the use of %' predates that, for instance 'perf bench syscall' uses it. Before: # perf bench uprobe all # Running uprobe/baseline benchmark... # Executed 1000 usleep(1000) calls Total time: 1054082243ns 1054082.243000 nsecs/op # After: # perf bench uprobe all # Running uprobe/baseline benchmark... # Executed 1,000 usleep(1000) calls Total time: 1,053,715,144ns 1,053,715.144000 nsecs/op # Fixes: c2a08203052f8975 ("perf bench: Add basic syscall benchmark") Cc: Adrian Hunter Cc: Andre Fredette Cc: Clark Williams Cc: Dave Tucker Cc: Davidlohr Bueso Cc: Derek Barbosa Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Tiezhu Yang Link: https://lore.kernel.org/lkml/ZH3lcepZ4tBYr1jv@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-bench.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 58f1cfe1eb34..db435b791a09 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -21,6 +21,7 @@ #include "builtin.h" #include "bench/bench.h" +#include #include #include #include @@ -260,6 +261,7 @@ int cmd_bench(int argc, const char **argv) /* Unbuffered output */ setvbuf(stdout, NULL, _IONBF, 0); + setlocale(LC_ALL, ""); if (argc < 2) { /* No collection specified. */ -- cgit v1.2.3 From 49f3806d89e4cf9e330b6f2e39db1c913a8fd25a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 May 2023 18:09:57 +0800 Subject: perf tools: Declare syscalltbl_*[] as const for all archs syscalltbl_*[] should never be changing, let us declare it as const. Suggested-by: Ian Rogers Reviewed-by: Huacai Chen Signed-off-by: Tiezhu Yang Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/1685441401-8709-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/entry/syscalls/mksyscalltbl | 2 +- tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl | 2 +- tools/perf/arch/mips/entry/syscalls/mksyscalltbl | 2 +- tools/perf/arch/powerpc/entry/syscalls/mksyscalltbl | 2 +- tools/perf/arch/s390/entry/syscalls/mksyscalltbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscalltbl.sh | 2 +- tools/perf/util/syscalltbl.c | 14 +++++++------- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 22cdf911dd9a..4edcdf6eb8ae 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -34,7 +34,7 @@ create_table_from_c() create_table() { echo "#include \"$input\"" - echo "static const char *syscalltbl_arm64[] = {" + echo "static const char *const syscalltbl_arm64[] = {" create_table_from_c echo "};" } diff --git a/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl index c52156f7204d..5fb83bd87503 100755 --- a/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl @@ -50,7 +50,7 @@ create_table_from_c() create_table() { - echo "static const char *syscalltbl_loongarch[] = {" + echo "static const char *const syscalltbl_loongarch[] = {" create_table_from_c echo "};" } diff --git a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl index fb1f49451af6..c0d93f959c4e 100644 --- a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl @@ -18,7 +18,7 @@ create_table() { local max_nr nr abi sc discard - echo 'static const char *syscalltbl_mips_n64[] = {' + echo 'static const char *const syscalltbl_mips_n64[] = {' while read nr abi sc discard; do printf '\t[%d] = "%s",\n' $nr $sc max_nr=$nr diff --git a/tools/perf/arch/powerpc/entry/syscalls/mksyscalltbl b/tools/perf/arch/powerpc/entry/syscalls/mksyscalltbl index 6c58060aa03b..0eb316fe6dd1 100755 --- a/tools/perf/arch/powerpc/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/powerpc/entry/syscalls/mksyscalltbl @@ -23,7 +23,7 @@ create_table() max_nr=-1 nr=0 - echo "static const char *syscalltbl_powerpc_${wordsize}[] = {" + echo "static const char *const syscalltbl_powerpc_${wordsize}[] = {" while read nr abi sc discard; do if [ "$max_nr" -lt "$nr" ]; then printf '\t[%d] = "%s",\n' $nr $sc diff --git a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl index 72ecbb676370..52eb88a77c94 100755 --- a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl @@ -18,7 +18,7 @@ create_table() { local max_nr nr abi sc discard - echo 'static const char *syscalltbl_s390_64[] = {' + echo 'static const char *const syscalltbl_s390_64[] = {' while read nr abi sc discard; do printf '\t[%d] = "%s",\n' $nr $sc max_nr=$nr diff --git a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh index 029a72c20b19..fa526a993845 100755 --- a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh +++ b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh @@ -18,7 +18,7 @@ emit() { syscall_macro "$nr" "$entry" } -echo "static const char *syscalltbl_${arch}[] = {" +echo "static const char *const syscalltbl_${arch}[] = {" sorted_table=$(mktemp /tmp/syscalltbl.XXXXXX) grep '^[0-9]' "$in" | sort -n > $sorted_table diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 313eccef6cb4..63be7b58761d 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -17,31 +17,31 @@ #if defined(__x86_64__) #include const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; -static const char **syscalltbl_native = syscalltbl_x86_64; +static const char *const *syscalltbl_native = syscalltbl_x86_64; #elif defined(__s390x__) #include const int syscalltbl_native_max_id = SYSCALLTBL_S390_64_MAX_ID; -static const char **syscalltbl_native = syscalltbl_s390_64; +static const char *const *syscalltbl_native = syscalltbl_s390_64; #elif defined(__powerpc64__) #include const int syscalltbl_native_max_id = SYSCALLTBL_POWERPC_64_MAX_ID; -static const char **syscalltbl_native = syscalltbl_powerpc_64; +static const char *const *syscalltbl_native = syscalltbl_powerpc_64; #elif defined(__powerpc__) #include const int syscalltbl_native_max_id = SYSCALLTBL_POWERPC_32_MAX_ID; -static const char **syscalltbl_native = syscalltbl_powerpc_32; +static const char *const *syscalltbl_native = syscalltbl_powerpc_32; #elif defined(__aarch64__) #include const int syscalltbl_native_max_id = SYSCALLTBL_ARM64_MAX_ID; -static const char **syscalltbl_native = syscalltbl_arm64; +static const char *const *syscalltbl_native = syscalltbl_arm64; #elif defined(__mips__) #include const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID; -static const char **syscalltbl_native = syscalltbl_mips_n64; +static const char *const *syscalltbl_native = syscalltbl_mips_n64; #elif defined(__loongarch__) #include const int syscalltbl_native_max_id = SYSCALLTBL_LOONGARCH_MAX_ID; -static const char **syscalltbl_native = syscalltbl_loongarch; +static const char *const *syscalltbl_native = syscalltbl_loongarch; #endif struct syscall { -- cgit v1.2.3 From 0d0db47634611bf25bb933fec801faa91702a3ab Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 May 2023 18:09:58 +0800 Subject: perf arm64: Rename create_table_from_c() to create_sc_table() After commit 9854e7ad35fecf30 ("perf arm64: Simplify mksyscalltbl") it has been removed the temporary C program and used shell to generate syscall table, so let us rename create_table_from_c() to create_sc_table() to avoid confusion. Suggested-by: Leo Yan Reviewed-by: Huacai Chen Signed-off-by: Tiezhu Yang Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/1685441401-8709-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/entry/syscalls/mksyscalltbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 4edcdf6eb8ae..84976dc5bdcf 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -19,7 +19,7 @@ if ! test -r $input; then exit 1 fi -create_table_from_c() +create_sc_table() { local sc nr last_sc @@ -35,7 +35,7 @@ create_table() { echo "#include \"$input\"" echo "static const char *const syscalltbl_arm64[] = {" - create_table_from_c + create_sc_table echo "};" } -- cgit v1.2.3 From d6e1cc6b7220073d6d5d2edd79edf2d36da046bf Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 May 2023 18:09:59 +0800 Subject: perf arm64: Handle __NR3264_ prefixed syscall number After commit 9854e7ad35fe ("perf arm64: Simplify mksyscalltbl"), in the generated syscall table file syscalls.c, there exist some __NR3264_ prefixed syscall numbers such as [__NR3264_ftruncate], it looks like not so good, just do some small filter operations to handle __NR3264_ prefixed syscall number as a digital number. Without this patch: [__NR3264_ftruncate] = "ftruncate", With this patch: [46] = "ftruncate", Suggested-by: Alexander Kapshuk Reviewed-by: Huacai Chen Reviewed-by: Leo Yan Signed-off-by: Tiezhu Yang Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/1685441401-8709-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/entry/syscalls/mksyscalltbl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 84976dc5bdcf..0bcd64a74642 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -40,6 +40,7 @@ create_table() } $gcc -E -dM -x c -I $incpath/include/uapi $input \ - |sed -ne 's/^#define __NR_//p' \ - |sort -t' ' -k2 -n \ + |awk '$2 ~ "__NR" && $3 !~ "__NR3264_" { + sub("^#define __NR(3264)?_", ""); + print | "sort -k2 -n"}' \ |create_table -- cgit v1.2.3 From 250e30badf11001a1015ca51a9d9cba2cf34fb97 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 May 2023 18:10:00 +0800 Subject: perf arm64: Use max_nr to define SYSCALLTBL_ARM64_MAX_ID Like x86, powerpc, mips and s390, use max_nr which is a digital number to define SYSCALLTBL_ARM64_MAX_ID. Reviewed-by: Huacai Chen Signed-off-by: Tiezhu Yang Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/1685441401-8709-5-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/entry/syscalls/mksyscalltbl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl index 0bcd64a74642..27d747c92d44 100755 --- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl @@ -21,14 +21,14 @@ fi create_sc_table() { - local sc nr last_sc + local sc nr max_nr while read sc nr; do printf "%s\n" " [$nr] = \"$sc\"," - last_sc=$sc + max_nr=$nr done - printf "%s\n" "#define SYSCALLTBL_ARM64_MAX_ID __NR_$last_sc" + echo "#define SYSCALLTBL_ARM64_MAX_ID $max_nr" } create_table() -- cgit v1.2.3 From 269f49f9cb1e94732ec1738d0b1af4653cadd2f5 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 May 2023 18:10:01 +0800 Subject: perf LoongArch: Simplify mksyscalltbl In order to print the numerical entries of the syscall table, there is no need to call the host compiler to build and then run a program, this can be done directly by the shell script. This is similar with commit 9854e7ad35fe ("perf arm64: Simplify mksyscalltbl"). For now, the mksyscalltbl file of LoongArch is almost same with arm64. Reviewed-by: Huacai Chen Reviewed-by: Leo Yan Signed-off-by: Tiezhu Yang Acked-by: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/1685441401-8709-6-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/loongarch/entry/syscalls/mksyscalltbl | 38 +++++++--------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl index 5fb83bd87503..c10ad3580aef 100755 --- a/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/loongarch/entry/syscalls/mksyscalltbl @@ -18,44 +18,28 @@ if ! test -r $input; then exit 1 fi -create_table_from_c() +create_sc_table() { - local sc nr last_sc - - create_table_exe=`mktemp ${TMPDIR:-/tmp}/create-table-XXXXXX` - - { - - cat <<-_EoHEADER - #include - #include "$input" - int main(int argc, char *argv[]) - { - _EoHEADER + local sc nr max_nr while read sc nr; do - printf "%s\n" " printf(\"\\t[%d] = \\\"$sc\\\",\\n\", $nr);" - last_sc=$nr + printf "%s\n" " [$nr] = \"$sc\"," + max_nr=$nr done - printf "%s\n" " printf(\"#define SYSCALLTBL_LOONGARCH_MAX_ID %d\\n\", $last_sc);" - printf "}\n" - - } | $hostcc -I $incpath/include/uapi -o $create_table_exe -x c - - - $create_table_exe - - rm -f $create_table_exe + echo "#define SYSCALLTBL_LOONGARCH_MAX_ID $max_nr" } create_table() { + echo "#include \"$input\"" echo "static const char *const syscalltbl_loongarch[] = {" - create_table_from_c + create_sc_table echo "};" } -$gcc -E -dM -x c -I $incpath/include/uapi $input \ - |sed -ne 's/^#define __NR_//p' \ - |sort -t' ' -k2 -n \ +$gcc -E -dM -x c -I $incpath/include/uapi $input \ + |awk '$2 ~ "__NR" && $3 !~ "__NR3264_" { + sub("^#define __NR(3264)?_", ""); + print | "sort -k2 -n"}' \ |create_table -- cgit v1.2.3 From 6f765bbbfb3c8c5993796402a3cba311e9506eed Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 18 May 2023 23:37:18 -0700 Subject: perf expr: Make the evaluation of & and | logical and lazy Currently the & and | operators are only used in metric thresholds like (from the tma_retiring metric): tma_retiring > 0.7 | tma_heavy_operations > 0.1 Thresholds are always computed when present, but a lack of events may mean the threshold can't be computed. This happens with the option --metric-no-threshold for say the metric tma_retiring on Tigerlake model CPUs. To fully compute the threshold tma_heavy_operations is needed and it needs the extra events of IDQ.MS_UOPS, UOPS_DECODED.DEC0, cpu/UOPS_DECODED.DEC0,cmask=1/ and IDQ.MITE_UOPS. So --metric-no-threshold is a useful option to reduce the number of events needed and potentially multiplexing of events. Rather than just fail threshold computations like this, we may know a result from just the left or right-hand side. So, for tma_retiring if its value is "> 0.7" we know it is over the threshold. This allows the metric to have the threshold coloring, when possible, without all the counters being programmed. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Kan Liang Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Andrii Nakryiko Cc: Caleb Biggers Cc: Eduard Zingerman Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Stephane Eranian Cc: Weilin Wang Link: https://lore.kernel.org/r/20230519063719.1029596-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/expr.c | 40 +++++++++++++++++++++++ tools/perf/util/expr.y | 86 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 109 insertions(+), 17 deletions(-) diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 733ead151c63..3d01eb5e2512 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -185,6 +185,46 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u NULL, ctx) == 0); TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 0); + /* The expression is a constant 0.0 without needing to evaluate EVENT1. */ + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("0 & EVENT1 > 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 0); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 > 0 & 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 0); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("1 & EVENT1 > 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1", &val_ptr)); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 > 0 & 1", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1", &val_ptr)); + + /* The expression is a constant 1.0 without needing to evaluate EVENT1. */ + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("1 | EVENT1 > 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 0); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 > 0 | 1", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 0); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("0 | EVENT1 > 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1", &val_ptr)); + expr__ctx_clear(ctx); + TEST_ASSERT_VAL("find ids", + expr__find_ids("EVENT1 > 0 | 0", NULL, ctx) == 0); + TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1); + TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1", &val_ptr)); + /* Test toplogy constants appear well ordered. */ expr__ctx_clear(ctx); TEST_ASSERT_VAL("#num_cpus", expr__parse(&num_cpus, ctx, "#num_cpus") == 0); diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y index 4ce931cccb63..f04963eb6be0 100644 --- a/tools/perf/util/expr.y +++ b/tools/perf/util/expr.y @@ -123,20 +123,6 @@ static struct ids handle_id(struct expr_parse_ctx *ctx, char *id, * constant value using OP. Its invariant that there are no ids. If computing * ids for non-constants union the set of IDs that must be computed. */ -#define BINARY_LONG_OP(RESULT, OP, LHS, RHS) \ - if (!compute_ids || (is_const(LHS.val) && is_const(RHS.val))) { \ - assert(LHS.ids == NULL); \ - assert(RHS.ids == NULL); \ - if (isnan(LHS.val) || isnan(RHS.val)) { \ - RESULT.val = NAN; \ - } else { \ - RESULT.val = (long)LHS.val OP (long)RHS.val; \ - } \ - RESULT.ids = NULL; \ - } else { \ - RESULT = union_expr(LHS, RHS); \ - } - #define BINARY_OP(RESULT, OP, LHS, RHS) \ if (!compute_ids || (is_const(LHS.val) && is_const(RHS.val))) { \ assert(LHS.ids == NULL); \ @@ -213,9 +199,75 @@ expr: NUMBER } | ID { $$ = handle_id(ctx, $1, compute_ids, /*source_count=*/false); } | SOURCE_COUNT '(' ID ')' { $$ = handle_id(ctx, $3, compute_ids, /*source_count=*/true); } -| expr '|' expr { BINARY_LONG_OP($$, |, $1, $3); } -| expr '&' expr { BINARY_LONG_OP($$, &, $1, $3); } -| expr '^' expr { BINARY_LONG_OP($$, ^, $1, $3); } +| expr '|' expr +{ + if (is_const($1.val) && is_const($3.val)) { + assert($1.ids == NULL); + assert($3.ids == NULL); + $$.ids = NULL; + $$.val = (fpclassify($1.val) == FP_ZERO && fpclassify($3.val) == FP_ZERO) ? 0 : 1; + } else if (is_const($1.val)) { + assert($1.ids == NULL); + if (fpclassify($1.val) == FP_ZERO) { + $$ = $3; + } else { + $$.val = 1; + $$.ids = NULL; + ids__free($3.ids); + } + } else if (is_const($3.val)) { + assert($3.ids == NULL); + if (fpclassify($3.val) == FP_ZERO) { + $$ = $1; + } else { + $$.val = 1; + $$.ids = NULL; + ids__free($1.ids); + } + } else { + $$ = union_expr($1, $3); + } +} +| expr '&' expr +{ + if (is_const($1.val) && is_const($3.val)) { + assert($1.ids == NULL); + assert($3.ids == NULL); + $$.val = (fpclassify($1.val) != FP_ZERO && fpclassify($3.val) != FP_ZERO) ? 1 : 0; + $$.ids = NULL; + } else if (is_const($1.val)) { + assert($1.ids == NULL); + if (fpclassify($1.val) != FP_ZERO) { + $$ = $3; + } else { + $$.val = 0; + $$.ids = NULL; + ids__free($3.ids); + } + } else if (is_const($3.val)) { + assert($3.ids == NULL); + if (fpclassify($3.val) != FP_ZERO) { + $$ = $1; + } else { + $$.val = 0; + $$.ids = NULL; + ids__free($1.ids); + } + } else { + $$ = union_expr($1, $3); + } +} +| expr '^' expr +{ + if (is_const($1.val) && is_const($3.val)) { + assert($1.ids == NULL); + assert($3.ids == NULL); + $$.val = (fpclassify($1.val) == FP_ZERO) != (fpclassify($3.val) == FP_ZERO) ? 1 : 0; + $$.ids = NULL; + } else { + $$ = union_expr($1, $3); + } +} | expr '<' expr { BINARY_OP($$, <, $1, $3); } | expr '>' expr { BINARY_OP($$, >, $1, $3); } | expr '+' expr { BINARY_OP($$, +, $1, $3); } -- cgit v1.2.3 From e6570967775bbc62a19920bbc4f13bfa73936218 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 18 May 2023 23:37:19 -0700 Subject: perf stat: Document --metric-no-threshold and threshold colors Document the threshold behavior for -M/--metrics. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Andrii Nakryiko Cc: Caleb Biggers Cc: Eduard Zingerman Cc: Edward Baker Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Cc: Stephane Eranian Cc: Weilin Wang Link: https://lore.kernel.org/r/20230519063719.1029596-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 785f0e2bcfac..8f789fa1242e 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -361,6 +361,15 @@ small group that need not have multiplexing is lowered. This option forbids the event merging logic from sharing events between groups and may be used to increase accuracy in this case. +--metric-no-threshold:: +Metric thresholds may increase the number of events necessary to +compute whether a metric has exceeded its threshold expression. This +may not be desirable, for example, as the events can introduce +multiplexing. This option disables the adding of threshold expression +events for a metric. However, if there are sufficient events to +compute the threshold then the threshold is still computed and used to +color the metric's computed value. + --quiet:: Don't print output, warnings or messages. This is useful with perf stat record below to only write data to the perf.data file. @@ -405,6 +414,12 @@ For a group all metrics from the group are added. The events from the metrics are automatically measured. See perf list output for the possible metrics and metricgroups. + When threshold information is available for a metric, the + color red is used to signify a metric has exceeded a threshold + while green shows it hasn't. The default color means that + no threshold information was available or the threshold + couldn't be computed. + -A:: --no-aggr:: Do not aggregate counts across all monitored CPUs. -- cgit v1.2.3 From d0b35979986e3bd03cb2f2e887e0b8036ae06198 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2023 13:50:53 -0700 Subject: perf annotate: Handle x86 instruction suffix generally In AT&T asm syntax, most of x86 instructions can have size suffix like b, w, l or q. Instead of adding all these instructions in the table, we can handle them in a general way. For example, it can try to find an instruction as is. If not found, assuming it has a suffix and it'd try again without the suffix if it's one of the allowed suffixes. This way, we can reduce the instruction table size for duplicated entries of the same instructions with a different suffix. If an instruction xyz and others like xyz are completely different ones, then they both need to be listed in the table so that they can be found before the second attempt (without the suffix). Reviewed-by: Adrian Hunter Reviewed-by: Masami Hiramatsu Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230524205054.3087004-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index b708bbc49c9e..7f05f2a2aa83 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -70,6 +70,7 @@ struct arch { struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name); bool sorted_instructions; bool initialized; + const char *insn_suffix; void *priv; unsigned int model; unsigned int family; @@ -179,6 +180,7 @@ static struct arch architectures[] = { .init = x86__annotate_init, .instructions = x86__instructions, .nr_instructions = ARRAY_SIZE(x86__instructions), + .insn_suffix = "bwlq", .objdump = { .comment_char = '#', }, @@ -720,6 +722,26 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name) } ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); + if (ins) + return ins->ops; + + if (arch->insn_suffix) { + char tmp[32]; + char suffix; + size_t len = strlen(name); + + if (len == 0 || len >= sizeof(tmp)) + return NULL; + + suffix = name[len - 1]; + if (strchr(arch->insn_suffix, suffix) == NULL) + return NULL; + + strcpy(tmp, name); + tmp[len - 1] = '\0'; /* remove the suffix and check again */ + + ins = bsearch(tmp, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp); + } return ins ? ins->ops : NULL; } -- cgit v1.2.3 From b541a91793fe124b199dc734aa5d7712d2993f06 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2023 13:50:54 -0700 Subject: perf annotate: Remove x86 instructions with suffix Now the suffix is handled in the general code. Let's get rid of them. Reviewed-by: Adrian Hunter Reviewed-by: Masami Hiramatsu Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230524205054.3087004-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 52 ++++++----------------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 5c7bec25fee4..5f4ac4fc7fcf 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -1,48 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * x86 instruction nmemonic table to parse disasm lines for annotate. + * This table is searched twice - one for exact match and another for + * match without a size suffix (b, w, l, q) in case of AT&T syntax. + * + * So this table should not have entries with the suffix unless it's + * a complete different instruction than ones without the suffix. + */ static struct ins x86__instructions[] = { { .name = "adc", .ops = &mov_ops, }, - { .name = "adcb", .ops = &mov_ops, }, - { .name = "adcl", .ops = &mov_ops, }, { .name = "add", .ops = &mov_ops, }, - { .name = "addl", .ops = &mov_ops, }, - { .name = "addq", .ops = &mov_ops, }, { .name = "addsd", .ops = &mov_ops, }, - { .name = "addw", .ops = &mov_ops, }, { .name = "and", .ops = &mov_ops, }, - { .name = "andb", .ops = &mov_ops, }, - { .name = "andl", .ops = &mov_ops, }, { .name = "andpd", .ops = &mov_ops, }, { .name = "andps", .ops = &mov_ops, }, - { .name = "andq", .ops = &mov_ops, }, - { .name = "andw", .ops = &mov_ops, }, { .name = "bsr", .ops = &mov_ops, }, { .name = "bt", .ops = &mov_ops, }, { .name = "btr", .ops = &mov_ops, }, { .name = "bts", .ops = &mov_ops, }, - { .name = "btsq", .ops = &mov_ops, }, { .name = "call", .ops = &call_ops, }, - { .name = "callq", .ops = &call_ops, }, { .name = "cmovbe", .ops = &mov_ops, }, { .name = "cmove", .ops = &mov_ops, }, { .name = "cmovae", .ops = &mov_ops, }, { .name = "cmp", .ops = &mov_ops, }, - { .name = "cmpb", .ops = &mov_ops, }, - { .name = "cmpl", .ops = &mov_ops, }, - { .name = "cmpq", .ops = &mov_ops, }, - { .name = "cmpw", .ops = &mov_ops, }, { .name = "cmpxch", .ops = &mov_ops, }, { .name = "cmpxchg", .ops = &mov_ops, }, { .name = "cs", .ops = &mov_ops, }, { .name = "dec", .ops = &dec_ops, }, - { .name = "decl", .ops = &dec_ops, }, - { .name = "decq", .ops = &dec_ops, }, { .name = "divsd", .ops = &mov_ops, }, { .name = "divss", .ops = &mov_ops, }, { .name = "gs", .ops = &mov_ops, }, { .name = "imul", .ops = &mov_ops, }, { .name = "inc", .ops = &dec_ops, }, - { .name = "incl", .ops = &dec_ops, }, - { .name = "incq", .ops = &dec_ops, }, { .name = "ja", .ops = &jump_ops, }, { .name = "jae", .ops = &jump_ops, }, { .name = "jb", .ops = &jump_ops, }, @@ -56,7 +45,6 @@ static struct ins x86__instructions[] = { { .name = "jl", .ops = &jump_ops, }, { .name = "jle", .ops = &jump_ops, }, { .name = "jmp", .ops = &jump_ops, }, - { .name = "jmpq", .ops = &jump_ops, }, { .name = "jna", .ops = &jump_ops, }, { .name = "jnae", .ops = &jump_ops, }, { .name = "jnb", .ops = &jump_ops, }, @@ -83,49 +71,31 @@ static struct ins x86__instructions[] = { { .name = "mov", .ops = &mov_ops, }, { .name = "movapd", .ops = &mov_ops, }, { .name = "movaps", .ops = &mov_ops, }, - { .name = "movb", .ops = &mov_ops, }, { .name = "movdqa", .ops = &mov_ops, }, { .name = "movdqu", .ops = &mov_ops, }, - { .name = "movl", .ops = &mov_ops, }, - { .name = "movq", .ops = &mov_ops, }, { .name = "movsd", .ops = &mov_ops, }, { .name = "movslq", .ops = &mov_ops, }, { .name = "movss", .ops = &mov_ops, }, { .name = "movupd", .ops = &mov_ops, }, { .name = "movups", .ops = &mov_ops, }, - { .name = "movw", .ops = &mov_ops, }, { .name = "movzbl", .ops = &mov_ops, }, { .name = "movzwl", .ops = &mov_ops, }, { .name = "mulsd", .ops = &mov_ops, }, { .name = "mulss", .ops = &mov_ops, }, { .name = "nop", .ops = &nop_ops, }, - { .name = "nopl", .ops = &nop_ops, }, - { .name = "nopw", .ops = &nop_ops, }, { .name = "or", .ops = &mov_ops, }, - { .name = "orb", .ops = &mov_ops, }, - { .name = "orl", .ops = &mov_ops, }, { .name = "orps", .ops = &mov_ops, }, - { .name = "orq", .ops = &mov_ops, }, { .name = "pand", .ops = &mov_ops, }, { .name = "paddq", .ops = &mov_ops, }, { .name = "pcmpeqb", .ops = &mov_ops, }, { .name = "por", .ops = &mov_ops, }, - { .name = "rclb", .ops = &mov_ops, }, - { .name = "rcll", .ops = &mov_ops, }, + { .name = "rcl", .ops = &mov_ops, }, { .name = "ret", .ops = &ret_ops, }, - { .name = "retq", .ops = &ret_ops, }, { .name = "sbb", .ops = &mov_ops, }, - { .name = "sbbl", .ops = &mov_ops, }, { .name = "sete", .ops = &mov_ops, }, { .name = "sub", .ops = &mov_ops, }, - { .name = "subl", .ops = &mov_ops, }, - { .name = "subq", .ops = &mov_ops, }, { .name = "subsd", .ops = &mov_ops, }, - { .name = "subw", .ops = &mov_ops, }, { .name = "test", .ops = &mov_ops, }, - { .name = "testb", .ops = &mov_ops, }, - { .name = "testl", .ops = &mov_ops, }, - { .name = "testq", .ops = &mov_ops, }, { .name = "tzcnt", .ops = &mov_ops, }, { .name = "ucomisd", .ops = &mov_ops, }, { .name = "ucomiss", .ops = &mov_ops, }, @@ -139,11 +109,9 @@ static struct ins x86__instructions[] = { { .name = "vsubsd", .ops = &mov_ops, }, { .name = "vucomisd", .ops = &mov_ops, }, { .name = "xadd", .ops = &mov_ops, }, - { .name = "xbeginl", .ops = &jump_ops, }, - { .name = "xbeginq", .ops = &jump_ops, }, + { .name = "xbegin", .ops = &jump_ops, }, { .name = "xchg", .ops = &mov_ops, }, { .name = "xor", .ops = &mov_ops, }, - { .name = "xorb", .ops = &mov_ops, }, { .name = "xorpd", .ops = &mov_ops, }, { .name = "xorps", .ops = &mov_ops, }, }; -- cgit v1.2.3 From d6748385098a8333a0e1c7e2d77119c919776728 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Jun 2023 13:34:25 -0700 Subject: tools headers: Make the difference output easier to read Add failures to an array and display it before exiting. Before: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/perf_regs.h' differs from latest version at 'arch/arm64/include/uapi/asm/perf_regs.h' diff -u tools/arch/arm64/include/uapi/asm/perf_regs.h arch/arm64/include/uapi/asm/perf_regs.h Warning: Kernel ABI header at 'tools/arch/arm64/include/asm/cputype.h' differs from latest version at 'arch/arm64/include/asm/cputype.h' diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h ... After: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h diff -u tools/arch/arm64/include/uapi/asm/perf_regs.h arch/arm64/include/uapi/asm/perf_regs.h diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h ... The aim is to make the warnings easier to read and distinguish from other Makefile warnings messages. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230605203425.1696844-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/check-headers.sh | 232 ++++++++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 104 deletions(-) diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 6f831ee2f60f..a0f1d8adce60 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -1,113 +1,121 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 -FILES=' -include/uapi/linux/const.h -include/uapi/drm/drm.h -include/uapi/drm/i915_drm.h -include/uapi/linux/fadvise.h -include/uapi/linux/fcntl.h -include/uapi/linux/fs.h -include/uapi/linux/fscrypt.h -include/uapi/linux/kcmp.h -include/uapi/linux/kvm.h -include/uapi/linux/in.h -include/uapi/linux/mount.h -include/uapi/linux/openat2.h -include/uapi/linux/perf_event.h -include/uapi/linux/prctl.h -include/uapi/linux/sched.h -include/uapi/linux/stat.h -include/uapi/linux/usbdevice_fs.h -include/uapi/linux/vhost.h -include/uapi/sound/asound.h -include/linux/bits.h -include/vdso/bits.h -include/linux/const.h -include/vdso/const.h -include/linux/hash.h -include/linux/list-sort.h -include/uapi/linux/hw_breakpoint.h -arch/x86/include/asm/disabled-features.h -arch/x86/include/asm/required-features.h -arch/x86/include/asm/cpufeatures.h -arch/x86/include/asm/inat_types.h -arch/x86/include/asm/emulate_prefix.h -arch/x86/include/asm/irq_vectors.h -arch/x86/include/asm/msr-index.h -arch/x86/include/uapi/asm/prctl.h -arch/x86/lib/x86-opcode-map.txt -arch/x86/tools/gen-insn-attr-x86.awk -arch/arm/include/uapi/asm/perf_regs.h -arch/arm64/include/uapi/asm/perf_regs.h -arch/loongarch/include/uapi/asm/perf_regs.h -arch/mips/include/uapi/asm/perf_regs.h -arch/powerpc/include/uapi/asm/perf_regs.h -arch/s390/include/uapi/asm/perf_regs.h -arch/x86/include/uapi/asm/perf_regs.h -arch/x86/include/uapi/asm/kvm.h -arch/x86/include/uapi/asm/kvm_perf.h -arch/x86/include/uapi/asm/svm.h -arch/x86/include/uapi/asm/unistd.h -arch/x86/include/uapi/asm/vmx.h -arch/powerpc/include/uapi/asm/kvm.h -arch/s390/include/uapi/asm/kvm.h -arch/s390/include/uapi/asm/kvm_perf.h -arch/s390/include/uapi/asm/sie.h -arch/arm/include/uapi/asm/kvm.h -arch/arm64/include/uapi/asm/kvm.h -arch/arm64/include/uapi/asm/unistd.h -arch/alpha/include/uapi/asm/errno.h -arch/mips/include/asm/errno.h -arch/mips/include/uapi/asm/errno.h -arch/parisc/include/uapi/asm/errno.h -arch/powerpc/include/uapi/asm/errno.h -arch/sparc/include/uapi/asm/errno.h -arch/x86/include/uapi/asm/errno.h -include/asm-generic/bitops/arch_hweight.h -include/asm-generic/bitops/const_hweight.h -include/asm-generic/bitops/__fls.h -include/asm-generic/bitops/fls.h -include/asm-generic/bitops/fls64.h -include/linux/coresight-pmu.h -include/uapi/asm-generic/errno.h -include/uapi/asm-generic/errno-base.h -include/uapi/asm-generic/ioctls.h -include/uapi/asm-generic/mman-common.h -include/uapi/asm-generic/unistd.h -' - -SYNC_CHECK_FILES=' -arch/x86/include/asm/inat.h -arch/x86/include/asm/insn.h -arch/x86/lib/inat.c -arch/x86/lib/insn.c -' +YELLOW='\033[0;33m' +NC='\033[0m' # No Color + +declare -a FILES +FILES=( + "include/uapi/linux/const.h" + "include/uapi/drm/drm.h" + "include/uapi/drm/i915_drm.h" + "include/uapi/linux/fadvise.h" + "include/uapi/linux/fcntl.h" + "include/uapi/linux/fs.h" + "include/uapi/linux/fscrypt.h" + "include/uapi/linux/kcmp.h" + "include/uapi/linux/kvm.h" + "include/uapi/linux/in.h" + "include/uapi/linux/mount.h" + "include/uapi/linux/openat2.h" + "include/uapi/linux/perf_event.h" + "include/uapi/linux/prctl.h" + "include/uapi/linux/sched.h" + "include/uapi/linux/stat.h" + "include/uapi/linux/usbdevice_fs.h" + "include/uapi/linux/vhost.h" + "include/uapi/sound/asound.h" + "include/linux/bits.h" + "include/vdso/bits.h" + "include/linux/const.h" + "include/vdso/const.h" + "include/linux/hash.h" + "include/linux/list-sort.h" + "include/uapi/linux/hw_breakpoint.h" + "arch/x86/include/asm/disabled-features.h" + "arch/x86/include/asm/required-features.h" + "arch/x86/include/asm/cpufeatures.h" + "arch/x86/include/asm/inat_types.h" + "arch/x86/include/asm/emulate_prefix.h" + "arch/x86/include/asm/irq_vectors.h" + "arch/x86/include/asm/msr-index.h" + "arch/x86/include/uapi/asm/prctl.h" + "arch/x86/lib/x86-opcode-map.txt" + "arch/x86/tools/gen-insn-attr-x86.awk" + "arch/arm/include/uapi/asm/perf_regs.h" + "arch/arm64/include/uapi/asm/perf_regs.h" + "arch/loongarch/include/uapi/asm/perf_regs.h" + "arch/mips/include/uapi/asm/perf_regs.h" + "arch/powerpc/include/uapi/asm/perf_regs.h" + "arch/s390/include/uapi/asm/perf_regs.h" + "arch/x86/include/uapi/asm/perf_regs.h" + "arch/x86/include/uapi/asm/kvm.h" + "arch/x86/include/uapi/asm/kvm_perf.h" + "arch/x86/include/uapi/asm/svm.h" + "arch/x86/include/uapi/asm/unistd.h" + "arch/x86/include/uapi/asm/vmx.h" + "arch/powerpc/include/uapi/asm/kvm.h" + "arch/s390/include/uapi/asm/kvm.h" + "arch/s390/include/uapi/asm/kvm_perf.h" + "arch/s390/include/uapi/asm/sie.h" + "arch/arm/include/uapi/asm/kvm.h" + "arch/arm64/include/uapi/asm/kvm.h" + "arch/arm64/include/uapi/asm/unistd.h" + "arch/alpha/include/uapi/asm/errno.h" + "arch/mips/include/asm/errno.h" + "arch/mips/include/uapi/asm/errno.h" + "arch/parisc/include/uapi/asm/errno.h" + "arch/powerpc/include/uapi/asm/errno.h" + "arch/sparc/include/uapi/asm/errno.h" + "arch/x86/include/uapi/asm/errno.h" + "include/asm-generic/bitops/arch_hweight.h" + "include/asm-generic/bitops/const_hweight.h" + "include/asm-generic/bitops/__fls.h" + "include/asm-generic/bitops/fls.h" + "include/asm-generic/bitops/fls64.h" + "include/linux/coresight-pmu.h" + "include/uapi/asm-generic/errno.h" + "include/uapi/asm-generic/errno-base.h" + "include/uapi/asm-generic/ioctls.h" + "include/uapi/asm-generic/mman-common.h" + "include/uapi/asm-generic/unistd.h" +) + +declare -a SYNC_CHECK_FILES +SYNC_CHECK_FILES=( + "arch/x86/include/asm/inat.h" + "arch/x86/include/asm/insn.h" + "arch/x86/lib/inat.c" + "arch/x86/lib/insn.c" +) # These copies are under tools/perf/trace/beauty/ as they are not used to in # building object files only by scripts in tools/perf/trace/beauty/ to generate # tables that then gets included in .c files for things like id->string syscall # tables (and the reverse lookup as well: string -> id) -BEAUTY_FILES=' -include/linux/socket.h -' +declare -a BEAUTY_FILES +BEAUTY_FILES=( + "include/linux/socket.h" +) + +declare -a FAILURES check_2 () { - file1=$1 - file2=$2 + tools_file=$1 + orig_file=$2 shift shift - cmd="diff $* $file1 $file2 > /dev/null" + cmd="diff $* $tools_file $orig_file > /dev/null" - test -f $file2 && { - eval $cmd || { - echo "Warning: Kernel ABI header at '$file1' differs from latest version at '$file2'" >&2 - echo diff -u $file1 $file2 - } - } + if [ -f "$orig_file" ] && ! eval "$cmd" + then + FAILURES+=( + "$tools_file $orig_file" + ) + fi } check () { @@ -115,7 +123,7 @@ check () { shift - check_2 tools/$file $file $* + check_2 "tools/$file" "$file" $* } beauty_check () { @@ -123,23 +131,29 @@ beauty_check () { shift - check_2 tools/perf/trace/beauty/$file $file $* + check_2 "tools/perf/trace/beauty/$file" "$file" $* } # Check if we have the kernel headers (tools/perf/../../include), else # we're probably on a detached tarball, so no point in trying to check # differences. -test -d ../../include || exit 0 +if ! [ -d ../../include ] +then + echo -e "${YELLOW}Warning${NC}: Skipped check-headers due to missing ../../include" + exit 0 +fi cd ../.. # simple diff check -for i in $FILES; do - check $i -B +for i in "${FILES[@]}" +do + check "$i" -B done -for i in $SYNC_CHECK_FILES; do - check $i '-I "^.*\/\*.*__ignore_sync_check__.*\*\/.*$"' +for i in "${SYNC_CHECK_FILES[@]}" +do + check "$i" '-I "^.*\/\*.*__ignore_sync_check__.*\*\/.*$"' done # diff with extra ignore lines @@ -160,8 +174,9 @@ check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/s check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl check_2 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl -for i in $BEAUTY_FILES; do - beauty_check $i -B +for i in "${BEAUTY_FILES[@]}" +do + beauty_check "$i" -B done # check duplicated library files @@ -169,3 +184,12 @@ check_2 tools/perf/util/hashmap.h tools/lib/bpf/hashmap.h check_2 tools/perf/util/hashmap.c tools/lib/bpf/hashmap.c cd tools/perf + +if [ ${#FAILURES[@]} -gt 0 ] +then + echo -e "${YELLOW}Warning${NC}: Kernel ABI header differences:" + for i in "${FAILURES[@]}" + do + echo " diff -u $i" + done +fi -- cgit v1.2.3 From 36d3e4138e1b6cc9ab179f3f397b5548f8b1eaae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 6 Jun 2023 16:11:10 -0300 Subject: perf script: Fix allocation of evsel->priv related to per-event dump files When printing output we may want to generate per event files, where the --per-event-dump option should be used, creating perf.data.EVENT.dump files instead of printing to stdout. The callback thar processes event thus expects that evsel->priv->fp should point to either the per-event FILE descriptor or to stdout. The a3af66f51bd0bca7 ("perf script: Fix crash because of missing evsel->priv") changeset fixed a case where evsel->priv wasn't setup, thus set to NULL, causing a segfault when trying to access evsel->priv->fp. But it did it for the non --per-event-dump case by allocating a 'struct perf_evsel_script' just to set its ->fp to stdout. Since evsel->priv is only freed when --per-event-dump is used, we ended up with a memory leak, detected using ASAN. Fix it by using the same method as perf_script__setup_per_event_dump(), and reuse that static 'struct perf_evsel_script'. Also check if evsel_script__new() failed. Fixes: a3af66f51bd0bca7 ("perf script: Fix crash because of missing evsel->priv") Reported-by: Ian Rogers Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ravi Bangoria Link: https://lore.kernel.org/lkml/ZH+F0wGAWV14zvMP@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 70549fc93b12..b02ad386a55b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2410,6 +2410,9 @@ out_put: return ret; } +// Used when scr->per_event_dump is not set +static struct evsel_script es_stdout; + static int process_attr(struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { @@ -2418,7 +2421,6 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, struct evsel *evsel, *pos; u64 sample_type; int err; - static struct evsel_script *es; err = perf_event__process_attr(tool, event, pevlist); if (err) @@ -2428,14 +2430,13 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, evsel = evlist__last(*pevlist); if (!evsel->priv) { - if (scr->per_event_dump) { + if (scr->per_event_dump) { evsel->priv = evsel_script__new(evsel, scr->session->data); - } else { - es = zalloc(sizeof(*es)); - if (!es) + if (!evsel->priv) return -ENOMEM; - es->fp = stdout; - evsel->priv = es; + } else { // Replicate what is done in perf_script__setup_per_event_dump() + es_stdout.fp = stdout; + evsel->priv = &es_stdout; } } @@ -2741,7 +2742,6 @@ out_err_fclose: static int perf_script__setup_per_event_dump(struct perf_script *script) { struct evsel *evsel; - static struct evsel_script es_stdout; if (script->per_event_dump) return perf_script__fopen_per_event_dump(script); -- cgit v1.2.3 From f0617f526cb0c482dd46ed798db28d3991f6f872 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 25 May 2023 11:29:02 +0300 Subject: perf parse: Allow config terms with breakpoints Add config terms to the parsing of breakpoint events. Extend "Test event parsing" to also cover using a confg term. This makes breakpoint events consistent with other events which already support config terms. Example: $ cat dr_test.c #include #include void func0(void) { } int main() { printf("func0 %p\n", &func0); while (1) { func0(); usleep(100000); } return 0; } $ gcc -g -O0 -o dr_test dr_test.c $ ./dr_test & [2] 19646 func0 0x55feb98dd169 $ perf record -e mem:0x55feb98dd169:x/name=breakpoint/ -p 19646 -- sleep 0.5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.017 MB perf.data (5 samples) ] $ perf script dr_test 19646 5632.956628: 1 breakpoint: 55feb98dd169 func0+0x0 (/home/ahunter/git/work/dr_test) dr_test 19646 5633.056866: 1 breakpoint: 55feb98dd169 func0+0x0 (/home/ahunter/git/work/dr_test) dr_test 19646 5633.157084: 1 breakpoint: 55feb98dd169 func0+0x0 (/home/ahunter/git/work/dr_test) dr_test 19646 5633.257309: 1 breakpoint: 55feb98dd169 func0+0x0 (/home/ahunter/git/work/dr_test) dr_test 19646 5633.357532: 1 breakpoint: 55feb98dd169 func0+0x0 (/home/ahunter/git/work/dr_test) $ sudo perf test "Test event parsing" 6: Parse event definition strings : 6.1: Test event parsing : Ok $ sudo perf test -v "Test event parsing" |& grep mem running test 8 'mem:0' running test 9 'mem:0:x' running test 10 'mem:0:r' running test 11 'mem:0:w' running test 19 'mem:0:u' running test 20 'mem:0:x:k' running test 21 'mem:0:r:hp' running test 22 'mem:0:w:up' running test 26 'mem:0:rw' running test 27 'mem:0:rw:kp' running test 42 'mem:0/1' running test 43 'mem:0/2:w' running test 44 'mem:0/4:rw:u' running test 58 'mem:0/name=breakpoint/' running test 59 'mem:0:x/name=breakpoint/' running test 60 'mem:0:r/name=breakpoint/' running test 61 'mem:0:w/name=breakpoint/' running test 62 'mem:0/name=breakpoint/u' running test 63 'mem:0:x/name=breakpoint/k' running test 64 'mem:0:r/name=breakpoint/hp' running test 65 'mem:0:w/name=breakpoint/up' running test 66 'mem:0:rw/name=breakpoint/' running test 67 'mem:0:rw/name=breakpoint/kp' running test 68 'mem:0/1/name=breakpoint/' running test 69 'mem:0/2:w/name=breakpoint/' running test 70 'mem:0/4:rw/name=breakpoint/u' running test 71 'mem:0/1/name=breakpoint1/,mem:0/4:rw/name=breakpoint2/' Committer notes: Folded follow up patch (see 2nd link below) to address warnings about unused tokens: perf tools: Suppress bison unused value warnings Patch "perf tools: Allow config terms with breakpoints" introduced parse tokens for colons and slashes within breakpoint parsing to prevent mix up with colons and slashes related to config terms. The token values are not needed but introduce bison "unused value" warnings. Suppress those warnings. Committer testing: # cat ~acme/c/mem_breakpoint.c #include #include void func1(void) { } void func2(void) { } void func3(void) { } void func4(void) { } void func5(void) { } int main() { printf("func1 %p\n", &func1); printf("func2 %p\n", &func2); printf("func3 %p\n", &func3); printf("func4 %p\n", &func4); printf("func5 %p\n", &func5); while (1) { func1(); func2(); func3(); func4(); func5(); usleep(100000); } return 0; } # ~acme/c/mem_breakpoint & [1] 3186153 func1 0x401136 func2 0x40113d func3 0x401144 func4 0x40114b func5 0x401152 # Trying to watch the first 4 functions for eXecutable access: # perf record -e mem:0x401136:x/name=breakpoint1/,mem:0x40113d:x/name=breakpoint2/,mem:0x401144:x/name=breakpoint3/,mem:0x40114b:x/name=breakpoint4/ -p 3186153 -- sleep 0.5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.026 MB perf.data (20 samples) ] [root@five ~]# perf script mem_breakpoint 3186153 131612.864793: 1 breakpoint1: 401136 func1+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.864795: 1 breakpoint2: 40113d func2+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.864796: 1 breakpoint3: 401144 func3+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.864797: 1 breakpoint4: 40114b func4+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.964868: 1 breakpoint1: 401136 func1+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.964870: 1 breakpoint2: 40113d func2+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.964871: 1 breakpoint3: 401144 func3+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131612.964872: 1 breakpoint4: 40114b func4+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.064945: 1 breakpoint1: 401136 func1+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.064948: 1 breakpoint2: 40113d func2+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.064948: 1 breakpoint3: 401144 func3+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.064949: 1 breakpoint4: 40114b func4+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.165024: 1 breakpoint1: 401136 func1+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.165026: 1 breakpoint2: 40113d func2+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.165027: 1 breakpoint3: 401144 func3+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.165028: 1 breakpoint4: 40114b func4+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.265103: 1 breakpoint1: 401136 func1+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.265105: 1 breakpoint2: 40113d func2+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.265106: 1 breakpoint3: 401144 func3+0x0 (/var/home/acme/c/mem_breakpoint) mem_breakpoint 3186153 131613.265107: 1 breakpoint4: 40114b func4+0x0 (/var/home/acme/c/mem_breakpoint) # Then all the 5 functions: # perf record -e mem:0x401136:x/name=breakpoint1/,mem:0x40113d:x/name=breakpoint2/,mem:0x401144:x/name=breakpoint3/,mem:0x40114b:x/name=breakpoint4/,mem:0x401152:x/name=breakpoint5/ -p 3186153 -- sleep 0.5 Error: The sys_perf_event_open() syscall returned with 28 (No space left on device) for event (breakpoint5). /bin/dmesg | grep -i perf may provide additional information. # grep -m1 'model name' /proc/cpuinfo model name : AMD Ryzen 9 5950X 16-Core Processor # Reviewed-by: Ian Rogers Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20230525082902.25332-2-adrian.hunter@intel.com Link: https://lore.kernel.org/r/f7228dc9-fe18-a8e3-7d3f-52922e0e1113@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 157 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 23 +++++- tools/perf/util/parse-events.h | 10 ++- tools/perf/util/parse-events.l | 23 +++++- tools/perf/util/parse-events.y | 49 ++++++++----- 5 files changed, 235 insertions(+), 27 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index bba1cd655a1d..133218e51ab4 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -486,6 +486,93 @@ static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist) return test__checkevent_breakpoint_rw(evlist); } +static int test__checkevent_breakpoint_modifier_name(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(evsel__name(evsel), "breakpoint")); + + return test__checkevent_breakpoint(evlist); +} + +static int test__checkevent_breakpoint_x_modifier_name(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(evsel__name(evsel), "breakpoint")); + + return test__checkevent_breakpoint_x(evlist); +} + +static int test__checkevent_breakpoint_r_modifier_name(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(evsel__name(evsel), "breakpoint")); + + return test__checkevent_breakpoint_r(evlist); +} + +static int test__checkevent_breakpoint_w_modifier_name(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(evsel__name(evsel), "breakpoint")); + + return test__checkevent_breakpoint_w(evlist); +} + +static int test__checkevent_breakpoint_rw_modifier_name(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->core.attr.precise_ip); + TEST_ASSERT_VAL("wrong name", + !strcmp(evsel__name(evsel), "breakpoint")); + + return test__checkevent_breakpoint_rw(evlist); +} + +static int test__checkevent_breakpoint_2_events(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + + TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint1")); + + evsel = evsel__next(evsel); + + TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong name", !strcmp(evsel__name(evsel), "breakpoint2")); + + return TEST_OK; +} + static int test__checkevent_pmu(struct evlist *evlist) { @@ -1973,6 +2060,76 @@ static const struct evlist_test test__events[] = { .check = test__term_equal_legacy, /* 9 */ }, + { + .name = "mem:0/name=breakpoint/", + .check = test__checkevent_breakpoint, + /* 0 */ + }, + { + .name = "mem:0:x/name=breakpoint/", + .check = test__checkevent_breakpoint_x, + /* 1 */ + }, + { + .name = "mem:0:r/name=breakpoint/", + .check = test__checkevent_breakpoint_r, + /* 2 */ + }, + { + .name = "mem:0:w/name=breakpoint/", + .check = test__checkevent_breakpoint_w, + /* 3 */ + }, + { + .name = "mem:0/name=breakpoint/u", + .check = test__checkevent_breakpoint_modifier_name, + /* 4 */ + }, + { + .name = "mem:0:x/name=breakpoint/k", + .check = test__checkevent_breakpoint_x_modifier_name, + /* 5 */ + }, + { + .name = "mem:0:r/name=breakpoint/hp", + .check = test__checkevent_breakpoint_r_modifier_name, + /* 6 */ + }, + { + .name = "mem:0:w/name=breakpoint/up", + .check = test__checkevent_breakpoint_w_modifier_name, + /* 7 */ + }, + { + .name = "mem:0:rw/name=breakpoint/", + .check = test__checkevent_breakpoint_rw, + /* 8 */ + }, + { + .name = "mem:0:rw/name=breakpoint/kp", + .check = test__checkevent_breakpoint_rw_modifier_name, + /* 9 */ + }, + { + .name = "mem:0/1/name=breakpoint/", + .check = test__checkevent_breakpoint_len, + /* 0 */ + }, + { + .name = "mem:0/2:w/name=breakpoint/", + .check = test__checkevent_breakpoint_len_w, + /* 1 */ + }, + { + .name = "mem:0/4:rw/name=breakpoint/u", + .check = test__checkevent_breakpoint_len_rw_modifier, + /* 2 */ + }, + { + .name = "mem:0/1/name=breakpoint1/,mem:0/4:rw/name=breakpoint2/", + .check = test__checkevent_breakpoint_2_events, + /* 3 */ + }, }; static const struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 629f7bd9fd59..2d36cadf35ec 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -946,10 +946,14 @@ do { \ return 0; } -int parse_events_add_breakpoint(struct list_head *list, int *idx, - u64 addr, char *type, u64 len) +int parse_events_add_breakpoint(struct parse_events_state *parse_state, + struct list_head *list, + u64 addr, char *type, u64 len, + struct list_head *head_config __maybe_unused) { struct perf_event_attr attr; + LIST_HEAD(config_terms); + const char *name; memset(&attr, 0, sizeof(attr)); attr.bp_addr = addr; @@ -970,8 +974,19 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx, attr.type = PERF_TYPE_BREAKPOINT; attr.sample_period = 1; - return add_event(list, idx, &attr, /*name=*/NULL, /*mertic_id=*/NULL, - /*config_terms=*/NULL); + if (head_config) { + if (config_attr(&attr, head_config, parse_state->error, + config_term_common)) + return -EINVAL; + + if (get_config_terms(head_config, &config_terms)) + return -ENOMEM; + } + + name = get_config_name(head_config); + + return add_event(list, &parse_state->idx, &attr, name, /*mertic_id=*/NULL, + &config_terms); } static int check_type_val(struct parse_events_term *term, diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 2021fe145410..5fdc1f33f57e 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -185,8 +185,10 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, struct parse_events_state *parse_state, struct list_head *head_config); int parse_events__decode_legacy_cache(const char *name, int pmu_type, __u64 *config); -int parse_events_add_breakpoint(struct list_head *list, int *idx, - u64 addr, char *type, u64 len); +int parse_events_add_breakpoint(struct parse_events_state *parse_state, + struct list_head *list, + u64 addr, char *type, u64 len, + struct list_head *head_config); int parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, char *name, struct list_head *head_config, @@ -226,6 +228,10 @@ void parse_events_error__handle(struct parse_events_error *err, int idx, void parse_events_error__print(struct parse_events_error *err, const char *event); +static inline void parse_events_unused_value(const void *x __maybe_unused) +{ +} + #ifdef HAVE_LIBELF_SUPPORT /* * If the probe point starts with '%', diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 6deb70c25984..7629af3d5c7c 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -190,11 +190,16 @@ name [a-zA-Z_*?\[\]][a-zA-Z0-9_*?.\[\]!\-]* name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? -/* If you add a modifier you need to update check_modifier() */ +/* + * If you add a modifier you need to update check_modifier(). + * Also, the letters in modifier_event must not be in modifier_bp. + */ modifier_event [ukhpPGHSDIWeb]+ modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) +digit [0-9] +non_digit [^0-9] %% @@ -304,8 +309,20 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } { {modifier_bp} { return str(yyscanner, PE_MODIFIER_BP); } -: { return ':'; } -"/" { return '/'; } + /* + * The colon before memory access modifiers can get mixed up with the + * colon before event modifiers. Fortunately none of the option letters + * are the same, so trailing context can be used disambiguate the two + * cases. + */ +":"/{modifier_bp} { return str(yyscanner, PE_BP_COLON); } + /* + * The slash before memory length can get mixed up with the slash before + * config terms. Fortunately config terms do not start with a numeric + * digit, so trailing context can be used disambiguate the two cases. + */ +"/"/{digit} { return str(yyscanner, PE_BP_SLASH); } +"/"/{non_digit} { BEGIN(config); return '/'; } {num_dec} { return value(yyscanner, 10); } {num_hex} { return value(yyscanner, 16); } /* diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index f96afb0edd0c..0c3d086cc22a 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -59,7 +59,7 @@ static void free_list_evsel(struct list_head* list_evsel) %token PE_EVENT_NAME %token PE_RAW PE_NAME %token PE_BPF_OBJECT PE_BPF_SOURCE -%token PE_MODIFIER_EVENT PE_MODIFIER_BP +%token PE_MODIFIER_EVENT PE_MODIFIER_BP PE_BP_COLON PE_BP_SLASH %token PE_LEGACY_CACHE %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP %token PE_ERROR @@ -80,6 +80,8 @@ static void free_list_evsel(struct list_head* list_evsel) %type PE_LEGACY_CACHE %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP +%type PE_BP_COLON +%type PE_BP_SLASH %type PE_EVENT_NAME %type PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM @@ -275,7 +277,7 @@ event_def event_def: event_pmu | event_legacy_symbol | event_legacy_cache sep_dc | - event_legacy_mem | + event_legacy_mem sep_dc | event_legacy_tracepoint sep_dc | event_legacy_numeric sep_dc | event_legacy_raw sep_dc | @@ -503,16 +505,19 @@ PE_LEGACY_CACHE opt_event_config } event_legacy_mem: -PE_PREFIX_MEM PE_VALUE '/' PE_VALUE ':' PE_MODIFIER_BP sep_dc +PE_PREFIX_MEM PE_VALUE PE_BP_SLASH PE_VALUE PE_BP_COLON PE_MODIFIER_BP opt_event_config { - struct parse_events_state *parse_state = _parse_state; struct list_head *list; int err; + parse_events_unused_value(&$3); + parse_events_unused_value(&$5); + list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_breakpoint(list, &parse_state->idx, - $2, $6, $4); + err = parse_events_add_breakpoint(_parse_state, list, + $2, $6, $4, $7); + parse_events_terms__delete($7); free($6); if (err) { free(list); @@ -521,31 +526,37 @@ PE_PREFIX_MEM PE_VALUE '/' PE_VALUE ':' PE_MODIFIER_BP sep_dc $$ = list; } | -PE_PREFIX_MEM PE_VALUE '/' PE_VALUE sep_dc +PE_PREFIX_MEM PE_VALUE PE_BP_SLASH PE_VALUE opt_event_config { - struct parse_events_state *parse_state = _parse_state; struct list_head *list; + int err; + + parse_events_unused_value(&$3); list = alloc_list(); ABORT_ON(!list); - if (parse_events_add_breakpoint(list, &parse_state->idx, - $2, NULL, $4)) { + err = parse_events_add_breakpoint(_parse_state, list, + $2, NULL, $4, $5); + parse_events_terms__delete($5); + if (err) { free(list); YYABORT; } $$ = list; } | -PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc +PE_PREFIX_MEM PE_VALUE PE_BP_COLON PE_MODIFIER_BP opt_event_config { - struct parse_events_state *parse_state = _parse_state; struct list_head *list; int err; + parse_events_unused_value(&$3); + list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_breakpoint(list, &parse_state->idx, - $2, $4, 0); + err = parse_events_add_breakpoint(_parse_state, list, + $2, $4, 0, $5); + parse_events_terms__delete($5); free($4); if (err) { free(list); @@ -554,15 +565,17 @@ PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc $$ = list; } | -PE_PREFIX_MEM PE_VALUE sep_dc +PE_PREFIX_MEM PE_VALUE opt_event_config { - struct parse_events_state *parse_state = _parse_state; struct list_head *list; + int err; list = alloc_list(); ABORT_ON(!list); - if (parse_events_add_breakpoint(list, &parse_state->idx, - $2, NULL, 0)) { + err = parse_events_add_breakpoint(_parse_state, list, + $2, NULL, 0, $3); + parse_events_terms__delete($3); + if (err) { free(list); YYABORT; } -- cgit v1.2.3 From d1f1cecc92ae0dba44eac3ce10baf4edb4553e41 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Jun 2023 16:23:58 -0700 Subject: perf list: Check if libpfm4 event is supported Some of its event info cannot be used directly due to missing default attributes. Let's check if the event is supported before printing like we do for hw and cache events. Signed-off-by: Namhyung Kim Acked-by: Ian Rogers @google.com> Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230608232400.3056312-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pfm.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/pfm.c b/tools/perf/util/pfm.c index 076aecc22c16..4c1024c343dd 100644 --- a/tools/perf/util/pfm.c +++ b/tools/perf/util/pfm.c @@ -13,6 +13,8 @@ #include "util/pmus.h" #include "util/pfm.h" #include "util/strbuf.h" +#include "util/cpumap.h" +#include "util/thread_map.h" #include #include @@ -123,6 +125,36 @@ error: return -1; } +static bool is_libpfm_event_supported(const char *name, struct perf_cpu_map *cpus, + struct perf_thread_map *threads) +{ + struct perf_pmu *pmu; + struct evsel *evsel; + struct perf_event_attr attr = {}; + bool result = true; + int ret; + + ret = pfm_get_perf_event_encoding(name, PFM_PLM0|PFM_PLM3, + &attr, NULL, NULL); + if (ret != PFM_SUCCESS) + return false; + + pmu = perf_pmus__find_by_type((unsigned int)attr.type); + evsel = parse_events__add_event(0, &attr, name, /*metric_id=*/NULL, pmu); + if (evsel == NULL) + return false; + + evsel->is_libpfm_event = true; + + if (evsel__open(evsel, cpus, threads) < 0) + result = false; + + evsel__close(evsel); + evsel__delete(evsel); + + return result; +} + static const char *srcs[PFM_ATTR_CTRL_MAX] = { [PFM_ATTR_CTRL_UNKNOWN] = "???", [PFM_ATTR_CTRL_PMU] = "PMU", @@ -146,6 +178,8 @@ print_libpfm_event(const struct print_callbacks *print_cb, void *print_state, { int j, ret; char topic[80], name[80]; + struct perf_cpu_map *cpus = perf_cpu_map__empty_new(1); + struct perf_thread_map *threads = thread_map__new_by_tid(0); strbuf_setlen(buf, 0); snprintf(topic, sizeof(topic), "pfm %s", pinfo->name); @@ -185,14 +219,15 @@ print_libpfm_event(const struct print_callbacks *print_cb, void *print_state, ainfo.name, ainfo.desc); } } - print_cb->print_event(print_state, - pinfo->name, - topic, - name, info->equiv, - /*scale_unit=*/NULL, - /*deprecated=*/NULL, "PFM event", - info->desc, /*long_desc=*/NULL, - /*encoding_desc=*/buf->buf); + + if (is_libpfm_event_supported(name, cpus, threads)) { + print_cb->print_event(print_state, pinfo->name, topic, + name, info->equiv, + /*scale_unit=*/NULL, + /*deprecated=*/NULL, "PFM event", + info->desc, /*long_desc=*/NULL, + /*encoding_desc=*/buf->buf); + } pfm_for_each_event_attr(j, info) { pfm_event_attr_info_t ainfo; @@ -215,6 +250,10 @@ print_libpfm_event(const struct print_callbacks *print_cb, void *print_state, print_attr_flags(buf, &ainfo); snprintf(name, sizeof(name), "%s::%s:%s", pinfo->name, info->name, ainfo.name); + + if (!is_libpfm_event_supported(name, cpus, threads)) + continue; + print_cb->print_event(print_state, pinfo->name, topic, @@ -225,6 +264,9 @@ print_libpfm_event(const struct print_callbacks *print_cb, void *print_state, /*encoding_desc=*/buf->buf); } } + + perf_cpu_map__put(cpus); + perf_thread_map__put(threads); } void print_libpfm_events(const struct print_callbacks *print_cb, void *print_state) -- cgit v1.2.3 From cc3d139bca0fdf14d8e5f0e2a3b4132fddebb14b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:23:59 -0700 Subject: perf list: Check arguments to show libpfm4 events This is particularly useful for tests. $ perf list pfm Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230608232400.3056312-3-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-list.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 03b5d26b2489..7fec2cca759f 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -15,6 +15,7 @@ #include "util/pmu.h" #include "util/debug.h" #include "util/metricgroup.h" +#include "util/pfm.h" #include "util/string2.h" #include "util/strlist.h" #include "util/strbuf.h" @@ -457,7 +458,11 @@ int cmd_list(int argc, const char **argv) OPT_END() }; const char * const list_usage[] = { +#ifdef HAVE_LIBPFM + "perf list [] [hw|sw|cache|tracepoint|pmu|sdt|metric|metricgroup|event_glob|pfm]", +#else "perf list [] [hw|sw|cache|tracepoint|pmu|sdt|metric|metricgroup|event_glob]", +#endif NULL }; @@ -539,7 +544,12 @@ int cmd_list(int argc, const char **argv) default_ps.metricgroups = true; default_ps.metrics = false; metricgroup__print(&print_cb, ps); - } else if ((sep = strchr(argv[i], ':')) != NULL) { + } +#ifdef HAVE_LIBPFM + else if (strcmp(argv[i], "pfm") == 0) + print_libpfm_events(&print_cb, ps); +#endif + else if ((sep = strchr(argv[i], ':')) != NULL) { char *old_pmu_glob = default_ps.pmu_glob; default_ps.event_glob = strdup(argv[i]); -- cgit v1.2.3 From dcf7a17714e63d6e38c8c9612fee2dbc2e64c57e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:24:00 -0700 Subject: perf test: Add test of libpfm4 events $ ./perf test -v 102 102: perf all libpfm4 events test : --- start --- test child forked, pid 3030994 Testing ix86arch::UNHALTED_CORE_CYCLES Testing ix86arch::INSTRUCTION_RETIRED Testing ix86arch::UNHALTED_REFERENCE_CYCLES Testing ix86arch::LLC_REFERENCES Testing ix86arch::LLC_MISSES Testing ix86arch::BRANCH_INSTRUCTIONS_RETIRED Testing ix86arch::MISPREDICTED_BRANCH_RETIRED Testing perf_raw::r0000 Testing icl::UNHALTED_CORE_CYCLES Testing icl::UNHALTED_REFERENCE_CYCLES ... test child finished with 0 ---- end ---- perf all libpfm4 events test: Ok Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230608232400.3056312-4-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_all_pfm.sh | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100755 tools/perf/tests/shell/stat_all_pfm.sh diff --git a/tools/perf/tests/shell/stat_all_pfm.sh b/tools/perf/tests/shell/stat_all_pfm.sh new file mode 100755 index 000000000000..4d004f777a6e --- /dev/null +++ b/tools/perf/tests/shell/stat_all_pfm.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# perf all libpfm4 events test +# SPDX-License-Identifier: GPL-2.0 + +if perf version --build-options | grep HAVE_LIBPFM | grep -q OFF +then + echo "Skipping, no libpfm4 support" + exit 2 +fi + +err=0 +for p in $(perf list --raw-dump pfm) +do + if echo "$p" | grep -q unc_ + then + echo "Skipping uncore event '$p' that may require additional options." + continue + fi + echo "Testing $p" + result=$(perf stat --pfm-events "$p" true 2>&1) + x=$? + if echo "$result" | grep -q "failed to parse event $p : invalid or missing unit mask" + then + continue + fi + if test "$x" -ne "0" + then + echo "Unexpected exit code '$x'" + err=1 + fi + if ! echo "$result" | grep -q "$p" && ! echo "$result" | grep -q "" + then + # We failed to see the event and it is supported. Possibly the workload was + # too small so retry with something longer. + result=$(perf stat --pfm-events "$p" perf bench internals synthesize 2>&1) + x=$? + if test "$x" -ne "0" + then + echo "Unexpected exit code '$x'" + err=1 + fi + if ! echo "$result" | grep -q "$p" + then + echo "Event '$p' not printed in:" + echo "$result" + err=1 + fi + fi +done + +exit "$err" -- cgit v1.2.3 From 40826c45eb0b88565b5ea3fa98e6c251b193ad4b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:27:58 -0700 Subject: perf thread: Remove notion of dead threads The dead thread list is best effort. Threads live on it until the reference count hits zero and they are removed. With correct reference counting this should never happen. It is, however, part of the 'perf sched' output that is now removed. If this is an issue we should implement tracking of dead threads in a robust not best-effort way. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 23 +---------------------- tools/perf/util/cs-etm.c | 6 ------ tools/perf/util/intel-pt.c | 8 -------- tools/perf/util/machine.c | 32 +------------------------------- tools/perf/util/thread.c | 25 +------------------------ tools/perf/util/thread.h | 11 +---------- 6 files changed, 4 insertions(+), 101 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cc4ba506e119..3a30c2ac5b47 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2760,7 +2760,7 @@ struct total_run_stats { u64 total_run_time; }; -static int __show_thread_runtime(struct thread *t, void *priv) +static int show_thread_runtime(struct thread *t, void *priv) { struct total_run_stats *stats = priv; struct thread_runtime *r; @@ -2783,22 +2783,6 @@ static int __show_thread_runtime(struct thread *t, void *priv) return 0; } -static int show_thread_runtime(struct thread *t, void *priv) -{ - if (t->dead) - return 0; - - return __show_thread_runtime(t, priv); -} - -static int show_deadthread_runtime(struct thread *t, void *priv) -{ - if (!t->dead) - return 0; - - return __show_thread_runtime(t, priv); -} - static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node) { const char *sep = " <- "; @@ -2890,11 +2874,6 @@ static void timehist_print_summary(struct perf_sched *sched, if (!task_count) printf("\n"); - printf("\nTerminated tasks:\n"); - machine__for_each_thread(m, show_deadthread_runtime, &totals); - if (task_count == totals.task_count) - printf("\n"); - /* CPU idle stats not tracked when samples were skipped */ if (sched->skipped_samples && !sched->idle_hist) return; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 91299cc56bf7..0f5be4ad24ba 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -3292,12 +3292,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, goto err_free_queues; } - /* - * Initialize list node so that at thread__zput() we can avoid - * segmentation fault at list_del_init(). - */ - INIT_LIST_HEAD(&etm->unknown_thread->node); - err = thread__set_comm(etm->unknown_thread, "unknown", 0); if (err) goto err_delete_thread; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index fe893c9bab3f..dde2ca77a005 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -4311,14 +4311,6 @@ int intel_pt_process_auxtrace_info(union perf_event *event, goto err_free_queues; } - /* - * Since this thread will not be kept in any rbtree not in a - * list, initialize its list node so that at thread__put() the - * current thread lifetime assumption is kept and we don't segfault - * at list_del_init(). - */ - INIT_LIST_HEAD(&pt->unknown_thread->node); - err = thread__set_comm(pt->unknown_thread, "unknown", 0); if (err) goto err_delete_thread; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 9e02e19c1b7a..a1954ac85f59 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -241,17 +241,6 @@ void machine__exit(struct machine *machine) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; - struct thread *thread, *n; - /* - * Forget about the dead, at this point whatever threads were - * left in the dead lists better have a reference count taken - * by who is using them, and then, when they drop those references - * and it finally hits zero, thread__put() will check and see that - * its not in the dead threads list and will not try to remove it - * from there, just calling thread__delete() straight away. - */ - list_for_each_entry_safe(thread, n, &threads->dead, node) - list_del_init(&thread->node); exit_rwsem(&threads->lock); } @@ -2046,18 +2035,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, rb_erase_cached(&th->rb_node, &threads->entries); RB_CLEAR_NODE(&th->rb_node); --threads->nr; - /* - * Move it first to the dead_threads list, then drop the reference, - * if this is the last reference, then the thread__delete destructor - * will be called and we will remove it from the dead_threads list. - */ - list_add_tail(&th->node, &threads->dead); - /* - * We need to do the put here because if this is the last refcount, - * then we will be touching the threads->dead head when removing the - * thread. - */ thread__put(th); if (lock) @@ -2145,10 +2123,8 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event if (dump_trace) perf_event__fprintf_task(event, stdout); - if (thread != NULL) { - thread__exited(thread); + if (thread != NULL) thread__put(thread); - } return 0; } @@ -3204,12 +3180,6 @@ int machine__for_each_thread(struct machine *machine, if (rc != 0) return rc; } - - list_for_each_entry(thread, &threads->dead, node) { - rc = fn(thread, priv); - if (rc != 0) - return rc; - } } return rc; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 4b5bdc277baa..d949bffc0ed6 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -125,31 +125,8 @@ struct thread *thread__get(struct thread *thread) void thread__put(struct thread *thread) { - if (thread && refcount_dec_and_test(&thread->refcnt)) { - /* - * Remove it from the dead threads list, as last reference is - * gone, if it is in a dead threads list. - * - * We may not be there anymore if say, the machine where it was - * stored was already deleted, so we already removed it from - * the dead threads and some other piece of code still keeps a - * reference. - * - * This is what 'perf sched' does and finally drops it in - * perf_sched__lat(), where it calls perf_sched__read_events(), - * that processes the events by creating a session and deleting - * it, which ends up destroying the list heads for the dead - * threads, but before it does that it removes all threads from - * it using list_del_init(). - * - * So we need to check here if it is in a dead threads list and - * if so, remove it before finally deleting the thread, to avoid - * an use after free situation. - */ - if (!list_empty(&thread->node)) - list_del_init(&thread->node); + if (thread && refcount_dec_and_test(&thread->refcnt)) thread__delete(thread); - } } static struct namespaces *__thread__namespaces(const struct thread *thread) diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 395c626699a9..86737812e06b 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -30,10 +30,7 @@ struct lbr_stitch { }; struct thread { - union { - struct rb_node rb_node; - struct list_head node; - }; + struct rb_node rb_node; struct maps *maps; pid_t pid_; /* Not all tools update this */ pid_t tid; @@ -43,7 +40,6 @@ struct thread { refcount_t refcnt; bool comm_set; int comm_len; - bool dead; /* if set thread has exited */ struct list_head namespaces_list; struct rw_semaphore namespaces_lock; struct list_head comm_list; @@ -81,11 +77,6 @@ static inline void __thread__zput(struct thread **thread) #define thread__zput(thread) __thread__zput(&thread) -static inline void thread__exited(struct thread *thread) -{ - thread->dead = true; -} - struct namespaces *thread__namespaces(struct thread *thread); int thread__set_namespaces(struct thread *thread, u64 timestamp, struct perf_record_namespaces *event); -- cgit v1.2.3 From 7ee227f674028435c01cb6fa02fa268ae48b1823 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:27:59 -0700 Subject: perf thread: Make threads rbtree non-invasive Separate the rbtree out of thread and into a new struct thread_rb_node. The refcnt is in thread and the rbtree is responsible for a single count. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/util/machine.c | 101 +++++++++++++++++++++++++++++--------------- tools/perf/util/thread.c | 3 -- tools/perf/util/thread.h | 6 ++- 5 files changed, 73 insertions(+), 41 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 92c6797e7cba..c7d526283baf 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -911,7 +911,7 @@ static int tasks_print(struct report *rep, FILE *fp) nd = rb_next(nd)) { task = tasks + itask++; - task->thread = rb_entry(nd, struct thread, rb_node); + task->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread; INIT_LIST_HEAD(&task->children); INIT_LIST_HEAD(&task->list); thread__set_priv(task->thread, task); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 62c7c99a0fe4..b0dd202d14eb 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4348,7 +4348,7 @@ DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_event struct thread *thread; ) { - entry->thread = rb_entry(nd, struct thread, rb_node); + entry->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread; } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a1954ac85f59..cbf092e32ee9 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -43,7 +43,8 @@ #include #include -static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock); +static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd, + struct thread *th, bool lock); static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip); static struct dso *machine__kernel_dso(struct machine *machine) @@ -72,6 +73,21 @@ static void machine__threads_init(struct machine *machine) } } +static int thread_rb_node__cmp_tid(const void *key, const struct rb_node *nd) +{ + int to_find = (int) *((pid_t *)key); + + return to_find - (int)rb_entry(nd, struct thread_rb_node, rb_node)->thread->tid; +} + +static struct thread_rb_node *thread_rb_node__find(const struct thread *th, + struct rb_root *tree) +{ + struct rb_node *nd = rb_find(&th->tid, tree, thread_rb_node__cmp_tid); + + return rb_entry(nd, struct thread_rb_node, rb_node); +} + static int machine__set_mmap_name(struct machine *machine) { if (machine__is_host(machine)) @@ -214,10 +230,10 @@ void machine__delete_threads(struct machine *machine) down_write(&threads->lock); nd = rb_first_cached(&threads->entries); while (nd) { - struct thread *t = rb_entry(nd, struct thread, rb_node); + struct thread_rb_node *trb = rb_entry(nd, struct thread_rb_node, rb_node); nd = rb_next(nd); - __machine__remove_thread(machine, t, false); + __machine__remove_thread(machine, trb, trb->thread, false); } up_write(&threads->lock); } @@ -605,6 +621,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, struct rb_node **p = &threads->entries.rb_root.rb_node; struct rb_node *parent = NULL; struct thread *th; + struct thread_rb_node *nd; bool leftmost = true; th = threads__get_last_match(threads, machine, pid, tid); @@ -613,7 +630,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, while (*p != NULL) { parent = *p; - th = rb_entry(parent, struct thread, rb_node); + th = rb_entry(parent, struct thread_rb_node, rb_node)->thread; if (th->tid == tid) { threads__set_last_match(threads, th); @@ -633,30 +650,39 @@ static struct thread *____machine__findnew_thread(struct machine *machine, return NULL; th = thread__new(pid, tid); - if (th != NULL) { - rb_link_node(&th->rb_node, parent, p); - rb_insert_color_cached(&th->rb_node, &threads->entries, leftmost); + if (th == NULL) + return NULL; - /* - * We have to initialize maps separately after rb tree is updated. - * - * The reason is that we call machine__findnew_thread - * within thread__init_maps to find the thread - * leader and that would screwed the rb tree. - */ - if (thread__init_maps(th, machine)) { - rb_erase_cached(&th->rb_node, &threads->entries); - RB_CLEAR_NODE(&th->rb_node); - thread__put(th); - return NULL; - } - /* - * It is now in the rbtree, get a ref - */ - thread__get(th); - threads__set_last_match(threads, th); - ++threads->nr; + nd = malloc(sizeof(*nd)); + if (nd == NULL) { + thread__put(th); + return NULL; + } + nd->thread = th; + + rb_link_node(&nd->rb_node, parent, p); + rb_insert_color_cached(&nd->rb_node, &threads->entries, leftmost); + + /* + * We have to initialize maps separately after rb tree is updated. + * + * The reason is that we call machine__findnew_thread within + * thread__init_maps to find the thread leader and that would screwed + * the rb tree. + */ + if (thread__init_maps(th, machine)) { + rb_erase_cached(&nd->rb_node, &threads->entries); + RB_CLEAR_NODE(&nd->rb_node); + free(nd); + thread__put(th); + return NULL; } + /* + * It is now in the rbtree, get a ref + */ + thread__get(th); + threads__set_last_match(threads, th); + ++threads->nr; return th; } @@ -1109,7 +1135,7 @@ size_t machine__fprintf(struct machine *machine, FILE *fp) for (nd = rb_first_cached(&threads->entries); nd; nd = rb_next(nd)) { - struct thread *pos = rb_entry(nd, struct thread, rb_node); + struct thread *pos = rb_entry(nd, struct thread_rb_node, rb_node)->thread; ret += thread__fprintf(pos, fp); } @@ -2020,10 +2046,14 @@ out_problem: return 0; } -static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock) +static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd, + struct thread *th, bool lock) { struct threads *threads = machine__threads(machine, th->tid); + if (!nd) + nd = thread_rb_node__find(th, &threads->entries.rb_root); + if (threads->last_match == th) threads__set_last_match(threads, NULL); @@ -2032,11 +2062,12 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, BUG_ON(refcount_read(&th->refcnt) == 0); - rb_erase_cached(&th->rb_node, &threads->entries); - RB_CLEAR_NODE(&th->rb_node); + thread__put(nd->thread); + rb_erase_cached(&nd->rb_node, &threads->entries); + RB_CLEAR_NODE(&nd->rb_node); --threads->nr; - thread__put(th); + free(nd); if (lock) up_write(&threads->lock); @@ -2044,7 +2075,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, void machine__remove_thread(struct machine *machine, struct thread *th) { - return __machine__remove_thread(machine, th, true); + return __machine__remove_thread(machine, NULL, th, true); } int machine__process_fork_event(struct machine *machine, union perf_event *event, @@ -3167,7 +3198,6 @@ int machine__for_each_thread(struct machine *machine, { struct threads *threads; struct rb_node *nd; - struct thread *thread; int rc = 0; int i; @@ -3175,8 +3205,9 @@ int machine__for_each_thread(struct machine *machine, threads = &machine->threads[i]; for (nd = rb_first_cached(&threads->entries); nd; nd = rb_next(nd)) { - thread = rb_entry(nd, struct thread, rb_node); - rc = fn(thread, priv); + struct thread_rb_node *trb = rb_entry(nd, struct thread_rb_node, rb_node); + + rc = fn(trb->thread, priv); if (rc != 0) return rc; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index d949bffc0ed6..38d300e3e4d3 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -66,7 +66,6 @@ struct thread *thread__new(pid_t pid, pid_t tid) list_add(&comm->list, &thread->comm_list); refcount_set(&thread->refcnt, 1); - RB_CLEAR_NODE(&thread->rb_node); /* Thread holds first ref to nsdata. */ thread->nsinfo = nsinfo__new(pid); srccode_state_init(&thread->srccode_state); @@ -84,8 +83,6 @@ void thread__delete(struct thread *thread) struct namespaces *namespaces, *tmp_namespaces; struct comm *comm, *tmp_comm; - BUG_ON(!RB_EMPTY_NODE(&thread->rb_node)); - thread_stack__free(thread); if (thread->maps) { diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 86737812e06b..3b3f9fb5a916 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -29,8 +29,12 @@ struct lbr_stitch { struct callchain_cursor_node *prev_lbr_cursor; }; +struct thread_rb_node { + struct rb_node rb_node; + struct thread *thread; +}; + struct thread { - struct rb_node rb_node; struct maps *maps; pid_t pid_; /* Not all tools update this */ pid_t tid; -- cgit v1.2.3 From ee84a3032b74055feed192a727e872b0a18d1140 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:00 -0700 Subject: perf thread: Add accessor functions for thread Using accessors will make it easier to add reference count checking in later patches. Committer notes: thread->nsinfo wasn't wrapped as it is used together with nsinfo__zput(), where does a trick to set the field with a refcount being dropped to NULL, and that doesn't work well with using thread__nsinfo(thread), that loses the &thread->nsinfo pointer. When refcount checking is added to 'struct thread', later in this series, nsinfo__zput(RC_CHK_ACCESS(thread)->nsinfo) will be used to check the thread pointer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/tests/dwarf-unwind.c | 2 +- tools/perf/arch/arm64/tests/dwarf-unwind.c | 2 +- tools/perf/arch/powerpc/tests/dwarf-unwind.c | 2 +- tools/perf/arch/x86/tests/dwarf-unwind.c | 2 +- tools/perf/builtin-c2c.c | 6 +- tools/perf/builtin-inject.c | 2 +- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-report.c | 12 +- tools/perf/builtin-sched.c | 51 +++--- tools/perf/builtin-script.c | 20 +-- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 26 +-- .../perf/scripts/python/Perf-Trace-Util/Context.c | 4 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/hists_common.c | 2 +- tools/perf/tests/hists_cumulate.c | 1 - tools/perf/tests/hists_output.c | 2 +- tools/perf/tests/perf-targz-src-pkg | 5 +- tools/perf/tests/thread-maps-share.c | 13 +- tools/perf/trace/beauty/pid.c | 4 +- tools/perf/ui/browsers/hists.c | 19 ++- tools/perf/ui/stdio/hist.c | 2 +- tools/perf/util/arm-spe.c | 4 +- tools/perf/util/cs-etm.c | 2 +- tools/perf/util/data-convert-json.c | 8 +- tools/perf/util/db-export.c | 16 +- tools/perf/util/dlfilter.c | 4 +- tools/perf/util/event.c | 6 +- tools/perf/util/hist.c | 6 +- tools/perf/util/intel-bts.c | 2 +- tools/perf/util/intel-pt.c | 12 +- tools/perf/util/jitdump.c | 10 +- tools/perf/util/machine.c | 91 +++++----- tools/perf/util/map.c | 2 +- tools/perf/util/maps.c | 2 +- .../util/scripting-engines/trace-event-python.c | 14 +- tools/perf/util/session.c | 2 +- tools/perf/util/sort.c | 10 +- tools/perf/util/thread-stack.c | 25 +-- tools/perf/util/thread.c | 161 +++++++++--------- tools/perf/util/thread.h | 188 ++++++++++++++++++++- tools/perf/util/unwind-libdw.c | 6 +- tools/perf/util/unwind-libunwind-local.c | 6 +- tools/perf/util/unwind-libunwind.c | 2 +- tools/perf/util/vdso.c | 2 +- 45 files changed, 485 insertions(+), 279 deletions(-) diff --git a/tools/perf/arch/arm/tests/dwarf-unwind.c b/tools/perf/arch/arm/tests/dwarf-unwind.c index 566fb6c0eae7..9bc304cb7762 100644 --- a/tools/perf/arch/arm/tests/dwarf-unwind.c +++ b/tools/perf/arch/arm/tests/dwarf-unwind.c @@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample, sp = (unsigned long) regs[PERF_REG_ARM_SP]; - map = maps__find(thread->maps, (u64)sp); + map = maps__find(thread__maps(thread), (u64)sp); if (!map) { pr_debug("failed to get stack map\n"); free(buf); diff --git a/tools/perf/arch/arm64/tests/dwarf-unwind.c b/tools/perf/arch/arm64/tests/dwarf-unwind.c index 90a7ef293ce7..b2603d0d3737 100644 --- a/tools/perf/arch/arm64/tests/dwarf-unwind.c +++ b/tools/perf/arch/arm64/tests/dwarf-unwind.c @@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample, sp = (unsigned long) regs[PERF_REG_ARM64_SP]; - map = maps__find(thread->maps, (u64)sp); + map = maps__find(thread__maps(thread), (u64)sp); if (!map) { pr_debug("failed to get stack map\n"); free(buf); diff --git a/tools/perf/arch/powerpc/tests/dwarf-unwind.c b/tools/perf/arch/powerpc/tests/dwarf-unwind.c index 32fffb593fbf..5ecf82893b84 100644 --- a/tools/perf/arch/powerpc/tests/dwarf-unwind.c +++ b/tools/perf/arch/powerpc/tests/dwarf-unwind.c @@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample, sp = (unsigned long) regs[PERF_REG_POWERPC_R1]; - map = maps__find(thread->maps, (u64)sp); + map = maps__find(thread__maps(thread), (u64)sp); if (!map) { pr_debug("failed to get stack map\n"); free(buf); diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c index 497593be80f2..5bfec3345d59 100644 --- a/tools/perf/arch/x86/tests/dwarf-unwind.c +++ b/tools/perf/arch/x86/tests/dwarf-unwind.c @@ -26,7 +26,7 @@ static int sample_ustack(struct perf_sample *sample, sp = (unsigned long) regs[PERF_REG_X86_SP]; - map = maps__find(thread->maps, (u64)sp); + map = maps__find(thread__maps(thread), (u64)sp); if (!map) { pr_debug("failed to get stack map\n"); free(buf); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 05dfd98af170..ee41a96f0c73 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -293,7 +293,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, } if (c2c.stitch_lbr) - al.thread->lbr_stitch_enable = true; + thread__set_lbr_stitch_enable(al.thread, true); ret = sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, sysctl_perf_event_max_stack); @@ -1149,14 +1149,14 @@ pid_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, { int width = c2c_width(fmt, hpp, he->hists); - return scnprintf(hpp->buf, hpp->size, "%*d", width, he->thread->pid_); + return scnprintf(hpp->buf, hpp->size, "%*d", width, thread__pid(he->thread)); } static int64_t pid_cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *left, struct hist_entry *right) { - return left->thread->pid_ - right->thread->pid_; + return thread__pid(left->thread) - thread__pid(right->thread); } static int64_t diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 61766eead4f4..d9e96d4624c6 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -417,7 +417,7 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, } vdso = is_vdso_map(filename); - nsi = nsinfo__get(thread->nsinfo); + nsi = nsinfo__get(thread__nsinfo(thread)); if (vdso) { /* The vdso maps are always on the host and not the diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 2150eeced892..fe9439a4fd66 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -964,7 +964,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, if (perf_kmem__skip_sample(sample)) return 0; - dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid); + dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread__tid(thread)); if (evsel->handler != NULL) { tracepoint_handler f = evsel->handler; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c7d526283baf..8ea6ab18534a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -292,7 +292,7 @@ static int process_sample_event(struct perf_tool *tool, } if (rep->stitch_lbr) - al.thread->lbr_stitch_enable = true; + thread__set_lbr_stitch_enable(al.thread, true); if (symbol_conf.hide_unresolved && al.sym == NULL) goto out_put; @@ -829,10 +829,10 @@ static struct task *tasks_list(struct task *task, struct machine *machine) return NULL; /* Last one in the chain. */ - if (thread->ppid == -1) + if (thread__ppid(thread) == -1) return task; - parent_thread = machine__find_thread(machine, -1, thread->ppid); + parent_thread = machine__find_thread(machine, -1, thread__ppid(thread)); if (!parent_thread) return ERR_PTR(-ENOENT); @@ -869,12 +869,12 @@ static void task__print_level(struct task *task, FILE *fp, int level) struct thread *thread = task->thread; struct task *child; int comm_indent = fprintf(fp, " %8d %8d %8d |%*s", - thread->pid_, thread->tid, thread->ppid, - level, ""); + thread__pid(thread), thread__tid(thread), + thread__ppid(thread), level, ""); fprintf(fp, "%s\n", thread__comm_str(thread)); - maps__fprintf_task(thread->maps, comm_indent, fp); + maps__fprintf_task(thread__maps(thread), comm_indent, fp); if (!list_empty(&task->children)) { list_for_each_entry(child, &task->children, list) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3a30c2ac5b47..fd37468c4f62 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -916,12 +916,12 @@ static int replay_fork_event(struct perf_sched *sched, if (verbose > 0) { printf("fork event\n"); - printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid); - printf("... child: %s/%d\n", thread__comm_str(child), child->tid); + printf("... parent: %s/%d\n", thread__comm_str(parent), thread__tid(parent)); + printf("... child: %s/%d\n", thread__comm_str(child), thread__tid(child)); } - register_pid(sched, parent->tid, thread__comm_str(parent)); - register_pid(sched, child->tid, thread__comm_str(child)); + register_pid(sched, thread__tid(parent), thread__comm_str(parent)); + register_pid(sched, thread__tid(child), thread__comm_str(child)); out_put: thread__put(child); thread__put(parent); @@ -1316,7 +1316,7 @@ static int latency_migrate_task_event(struct perf_sched *sched, if (!atoms) { if (thread_atoms_insert(sched, migrant)) goto out_put; - register_pid(sched, migrant->tid, thread__comm_str(migrant)); + register_pid(sched, thread__tid(migrant), thread__comm_str(migrant)); atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); if (!atoms) { pr_err("migration-event: Internal tree error"); @@ -1359,10 +1359,13 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_ sched->all_runtime += work_list->total_runtime; sched->all_count += work_list->nb_atoms; - if (work_list->num_merged > 1) - ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged); - else - ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid); + if (work_list->num_merged > 1) { + ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), + work_list->num_merged); + } else { + ret = printf(" %s:%d ", thread__comm_str(work_list->thread), + thread__tid(work_list->thread)); + } for (i = 0; i < 24 - ret; i++) printf(" "); @@ -1380,11 +1383,15 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_ static int pid_cmp(struct work_atoms *l, struct work_atoms *r) { + pid_t l_tid, r_tid; + if (l->thread == r->thread) return 0; - if (l->thread->tid < r->thread->tid) + l_tid = thread__tid(l->thread); + r_tid = thread__tid(r->thread); + if (l_tid < r_tid) return -1; - if (l->thread->tid > r->thread->tid) + if (l_tid > r_tid) return 1; return (int)(l->thread - r->thread); } @@ -1679,14 +1686,14 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel, timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); color_fprintf(stdout, color, " %12s secs ", stimestamp); - if (new_shortname || tr->comm_changed || (verbose > 0 && sched_in->tid)) { + if (new_shortname || tr->comm_changed || (verbose > 0 && thread__tid(sched_in))) { const char *pid_color = color; if (thread__has_color(sched_in)) pid_color = COLOR_PIDS; color_fprintf(stdout, pid_color, "%s => %s:%d", - tr->shortname, thread__comm_str(sched_in), sched_in->tid); + tr->shortname, thread__comm_str(sched_in), thread__tid(sched_in)); tr->comm_changed = false; } @@ -1948,8 +1955,8 @@ static char *timehist_get_commstr(struct thread *thread) { static char str[32]; const char *comm = thread__comm_str(thread); - pid_t tid = thread->tid; - pid_t pid = thread->pid_; + pid_t tid = thread__tid(thread); + pid_t pid = thread__pid(thread); int n; if (pid == 0) @@ -2032,7 +2039,7 @@ static char task_state_char(struct thread *thread, int state) unsigned bit = state ? ffs(state) : 0; /* 'I' for idle */ - if (thread->tid == 0) + if (thread__tid(thread) == 0) return 'I'; return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; @@ -2067,7 +2074,7 @@ static void timehist_print_sample(struct perf_sched *sched, for (i = 0; i < max_cpus; ++i) { /* flag idle times with 'i'; others are sched events */ if (i == sample->cpu) - c = (thread->tid == 0) ? 'i' : 's'; + c = (thread__tid(thread) == 0) ? 'i' : 's'; else c = ' '; printf("%c", c); @@ -2094,7 +2101,7 @@ static void timehist_print_sample(struct perf_sched *sched, if (sched->show_wakeups && !sched->show_next) printf(" %-*s", comm_width, ""); - if (thread->tid == 0) + if (thread__tid(thread) == 0) goto out; if (sched->show_callchain) @@ -2626,7 +2633,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, t = ptime->end; } - if (!sched->idle_hist || thread->tid == 0) { + if (!sched->idle_hist || thread__tid(thread) == 0) { if (!cpu_list || test_bit(sample->cpu, cpu_bitmap)) timehist_update_runtime_stats(tr, t, tprev); @@ -2634,7 +2641,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, struct idle_thread_runtime *itr = (void *)tr; struct thread_runtime *last_tr; - BUG_ON(thread->tid != 0); + BUG_ON(thread__tid(thread) != 0); if (itr->last_thread == NULL) goto out; @@ -2719,7 +2726,7 @@ static void print_thread_runtime(struct thread *t, float stddev; printf("%*s %5d %9" PRIu64 " ", - comm_width, timehist_get_commstr(t), t->ppid, + comm_width, timehist_get_commstr(t), thread__ppid(t), (u64) r->run_stats.n); print_sched_time(r->total_run_time, 8); @@ -2739,7 +2746,7 @@ static void print_thread_waittime(struct thread *t, struct thread_runtime *r) { printf("%*s %5d %9" PRIu64 " ", - comm_width, timehist_get_commstr(t), t->ppid, + comm_width, timehist_get_commstr(t), thread__ppid(t), (u64) r->run_stats.n); print_sched_time(r->total_run_time, 8); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b02ad386a55b..e756290de2ac 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1142,7 +1142,7 @@ static int print_srccode(struct thread *thread, u8 cpumode, uint64_t addr) if (!al.map) return 0; ret = map__fprintf_srccode(al.map, al.addr, stdout, - &thread->srccode_state); + thread__srccode_state(thread)); if (ret) ret += printf("\n"); return ret; @@ -1439,7 +1439,7 @@ static int perf_sample__fprintf_callindent(struct perf_sample *sample, * The 'return' has already been popped off the stack so the depth has * to be adjusted to match the 'call'. */ - if (thread->ts && sample->flags & PERF_IP_FLAG_RETURN) + if (thread__ts(thread) && sample->flags & PERF_IP_FLAG_RETURN) depth += 1; name = resolve_branch_sym(sample, evsel, thread, al, addr_al, &ip); @@ -1577,7 +1577,7 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, printed += fprintf(fp, "\n"); if (PRINT_FIELD(SRCCODE)) { int ret = map__fprintf_srccode(al->map, al->addr, stdout, - &thread->srccode_state); + thread__srccode_state(thread)); if (ret) { printed += ret; printed += printf("\n"); @@ -2086,9 +2086,9 @@ static bool show_event(struct perf_sample *sample, if (!symbol_conf.graph_function) return true; - if (thread->filter) { - if (depth <= thread->filter_entry_depth) { - thread->filter = false; + if (thread__filter(thread)) { + if (depth <= thread__filter_entry_depth(thread)) { + thread__set_filter(thread, false); return false; } return true; @@ -2105,8 +2105,8 @@ static bool show_event(struct perf_sample *sample, while (*s) { unsigned len = strcspn(s, ","); if (nlen == len && !strncmp(name, s, len)) { - thread->filter = true; - thread->filter_entry_depth = depth; + thread__set_filter(thread, true); + thread__set_filter_entry_depth(thread, depth); return true; } s += len; @@ -2186,7 +2186,7 @@ static void process_event(struct perf_script *script, struct callchain_cursor *cursor = NULL; if (script->stitch_lbr) - al->thread->lbr_stitch_enable = true; + thread__set_lbr_stitch_enable(al->thread, true); if (symbol_conf.use_callchain && sample->callchain && thread__resolve_callchain(al->thread, &callchain_cursor, evsel, @@ -2241,7 +2241,7 @@ static void process_event(struct perf_script *script, if (PRINT_FIELD(SRCCODE)) { if (map__fprintf_srccode(al->map, al->addr, stdout, - &thread->srccode_state)) + thread__srccode_state(thread))) printf("\n"); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 27a7f068207d..9d3cbebb9b79 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -777,7 +777,7 @@ static void perf_event__process_sample(struct perf_tool *tool, return; if (top->stitch_lbr) - al.thread->lbr_stitch_enable = true; + thread__set_lbr_stitch_enable(al.thread, true); if (!machine->kptr_restrict_warned && symbol_conf.kptr_restrict && diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b0dd202d14eb..4c9bec39423b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1386,12 +1386,13 @@ static int thread__read_fd_path(struct thread *thread, int fd) struct stat st; int ret; - if (thread->pid_ == thread->tid) { + if (thread__pid(thread) == thread__tid(thread)) { scnprintf(linkname, sizeof(linkname), - "/proc/%d/fd/%d", thread->pid_, fd); + "/proc/%d/fd/%d", thread__pid(thread), fd); } else { scnprintf(linkname, sizeof(linkname), - "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); + "/proc/%d/task/%d/fd/%d", + thread__pid(thread), thread__tid(thread), fd); } if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) @@ -1559,7 +1560,7 @@ static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread if (trace->multiple_threads) { if (trace->show_comm) printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); - printed += fprintf(fp, "%d ", thread->tid); + printed += fprintf(fp, "%d ", thread__tid(thread)); } return printed; @@ -2205,7 +2206,8 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32)); } else { pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n", - thread__comm_str(thread), thread->pid_, thread->tid); + thread__comm_str(thread), thread__pid(thread), + thread__tid(thread)); return; } @@ -2550,7 +2552,7 @@ errno_print: { if (child != NULL) { fprintf(trace->output, "%ld", ret); - if (child->comm_set) + if (thread__comm_set(child)) fprintf(trace->output, " (%s)", thread__comm_str(child)); thread__put(child); } @@ -3616,14 +3618,16 @@ static int trace__set_filter_loop_pids(struct trace *trace) struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]); while (thread && nr < ARRAY_SIZE(pids)) { - struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid); + struct thread *parent = machine__find_thread(trace->host, + thread__ppid(thread), + thread__ppid(thread)); if (parent == NULL) break; if (!strcmp(thread__comm_str(parent), "sshd") || strstarts(thread__comm_str(parent), "gnome-terminal")) { - pids[nr++] = parent->tid; + pids[nr++] = thread__tid(parent); break; } thread = parent; @@ -4322,7 +4326,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; - printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); + printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread)); printed += fprintf(fp, "%lu events, ", ttrace->nr_events); printed += fprintf(fp, "%.1f%%", ratio); if (ttrace->pfmaj) @@ -4344,7 +4348,9 @@ static unsigned long thread__nr_events(struct thread_trace *ttrace) return ttrace ? ttrace->nr_events : 0; } -DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), +DEFINE_RESORT_RB(threads, + (thread__nr_events(thread__priv(a->thread)) < + thread__nr_events(thread__priv(b->thread))), struct thread *thread; ) { diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c index 53b1587db403..3954bd1587ce 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c +++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c @@ -100,8 +100,8 @@ static PyObject *perf_sample_insn(PyObject *obj, PyObject *args) if (!c) return NULL; - if (c->sample->ip && !c->sample->insn_len && c->al->thread->maps) { - struct machine *machine = maps__machine(c->al->thread->maps); + if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) { + struct machine *machine = maps__machine(thread__maps(c->al->thread)); script_fetch_insn(c->sample, c->al->thread, machine); } diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index efe026a35010..9d8eefbebd48 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -269,7 +269,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, len = map__end(al.map) - addr; /* Read the object code using perf */ - ret_len = dso__data_read_offset(dso, maps__machine(thread->maps), + ret_len = dso__data_read_offset(dso, maps__machine(thread__maps(thread)), al.addr, buf1, len); if (ret_len != len) { pr_debug("dso__data_read_offset failed\n"); diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index 745ab18d17db..d08add0f4da6 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -211,7 +211,7 @@ void print_hists_out(struct hists *hists) struct dso *dso = map__dso(he->ms.map); pr_info("%2d: entry: %8s:%5d [%-8s] %20s: period = %"PRIu64"/%"PRIu64"\n", - i, thread__comm_str(he->thread), he->thread->tid, + i, thread__comm_str(he->thread), thread__tid(he->thread), dso->short_name, he->ms.sym->name, he->stat.period, he->stat_acc ? he->stat_acc->period : 0); diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 8c0e3f334747..62b9c6461ea6 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -162,7 +162,6 @@ typedef int (*test_fn_t)(struct evsel *, struct machine *); #define DSO(he) (map__dso(he->ms.map)->short_name) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) -#define PID(he) (he->thread->tid) #define DEPTH(he) (he->callchain->max_depth) #define CDSO(cl) (map__dso(cl->ms.map)->short_name) #define CSYM(cl) (cl->ms.sym->name) diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index cebd5226bb12..cd2094c13e1e 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -128,7 +128,7 @@ typedef int (*test_fn_t)(struct evsel *, struct machine *); #define DSO(he) (map__dso(he->ms.map)->short_name) #define SYM(he) (he->ms.sym->name) #define CPU(he) (he->cpu) -#define PID(he) (he->thread->tid) +#define PID(he) (thread__tid(he->thread)) /* default sort keys (no field) */ static int test1(struct evsel *evsel, struct machine *machine) diff --git a/tools/perf/tests/perf-targz-src-pkg b/tools/perf/tests/perf-targz-src-pkg index fae26b1cf08f..b3075c168cb2 100755 --- a/tools/perf/tests/perf-targz-src-pkg +++ b/tools/perf/tests/perf-targz-src-pkg @@ -7,16 +7,17 @@ # be in such tarball, which sometimes gets broken when we move files around, # like when we made some files that were in tools/perf/ available to other tools/ # codebases by moving it to tools/include/, etc. +set -e PERF=$1 cd ${PERF}/../.. -make perf-targz-src-pkg > /dev/null +make perf-targz-src-pkg TARBALL=$(ls -rt perf-*.tar.gz) TMP_DEST=$(mktemp -d) tar xf ${TARBALL} -C $TMP_DEST rm -f ${TARBALL} cd - > /dev/null -make -C $TMP_DEST/perf*/tools/perf > /dev/null +make -C $TMP_DEST/perf*/tools/perf RC=$? rm -rf ${TMP_DEST} exit $RC diff --git a/tools/perf/tests/thread-maps-share.c b/tools/perf/tests/thread-maps-share.c index 858e725318a9..faf980b26252 100644 --- a/tools/perf/tests/thread-maps-share.c +++ b/tools/perf/tests/thread-maps-share.c @@ -42,13 +42,13 @@ static int test__thread_maps_share(struct test_suite *test __maybe_unused, int s TEST_ASSERT_VAL("failed to create threads", leader && t1 && t2 && t3 && other); - maps = leader->maps; + maps = thread__maps(leader); TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(maps__refcnt(maps)), 4); /* test the maps pointer is shared */ - TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(t1->maps)); - TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(t2->maps)); - TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(t3->maps)); + TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(thread__maps(t1))); + TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(thread__maps(t2))); + TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(maps) == RC_CHK_ACCESS(thread__maps(t3))); /* * Verify the other leader was created by previous call. @@ -70,10 +70,11 @@ static int test__thread_maps_share(struct test_suite *test __maybe_unused, int s machine__remove_thread(machine, other); machine__remove_thread(machine, other_leader); - other_maps = other->maps; + other_maps = thread__maps(other); TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(maps__refcnt(other_maps)), 2); - TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(other_maps) == RC_CHK_ACCESS(other_leader->maps)); + TEST_ASSERT_VAL("maps don't match", RC_CHK_ACCESS(other_maps) == + RC_CHK_ACCESS(thread__maps(other_leader))); /* release thread group */ thread__put(t3); diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c index 1a6acc46807b..8f9c9950f8ba 100644 --- a/tools/perf/trace/beauty/pid.c +++ b/tools/perf/trace/beauty/pid.c @@ -8,10 +8,10 @@ size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg struct thread *thread = machine__findnew_thread(trace->host, pid, pid); if (thread != NULL) { - if (!thread->comm_set) + if (!thread__comm_set(thread)) thread__set_comm_from_proc(thread); - if (thread->comm_set) + if (thread__comm_set(thread)) printed += scnprintf(bf + printed, size - printed, " (%s)", thread__comm_str(thread)); thread__put(thread); diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 69c81759a64f..c7ad9e003080 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2533,13 +2533,15 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) thread__zput(browser->hists->thread_filter); ui_helpline__pop(); } else { + const char *comm_set_str = + thread__comm_set(thread) ? thread__comm_str(thread) : ""; + if (hists__has(browser->hists, thread)) { ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"", - thread->comm_set ? thread__comm_str(thread) : "", - thread->tid); + comm_set_str, thread__tid(thread)); } else { ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s thread\"", - thread->comm_set ? thread__comm_str(thread) : ""); + comm_set_str); } browser->hists->thread_filter = thread__get(thread); @@ -2557,20 +2559,19 @@ add_thread_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct thread *thread) { int ret; + const char *comm_set_str, *in_out; if ((!hists__has(browser->hists, thread) && !hists__has(browser->hists, comm)) || thread == NULL) return 0; + in_out = browser->hists->thread_filter ? "out of" : "into"; + comm_set_str = thread__comm_set(thread) ? thread__comm_str(thread) : ""; if (hists__has(browser->hists, thread)) { ret = asprintf(optstr, "Zoom %s %s(%d) thread", - browser->hists->thread_filter ? "out of" : "into", - thread->comm_set ? thread__comm_str(thread) : "", - thread->tid); + in_out, comm_set_str, thread__tid(thread)); } else { - ret = asprintf(optstr, "Zoom %s %s thread", - browser->hists->thread_filter ? "out of" : "into", - thread->comm_set ? thread__comm_str(thread) : ""); + ret = asprintf(optstr, "Zoom %s %s thread", in_out, comm_set_str); } if (ret < 0) return 0; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index f36270485168..b849caace398 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -885,7 +885,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, } if (h->ms.map == NULL && verbose > 1) { - maps__fprintf(h->thread->maps, fp); + maps__fprintf(thread__maps(h->thread), fp); fprintf(fp, "%.10s end\n", graph_dotted_line); } } diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 7b36ba6b4079..afbd5869f6bf 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -254,9 +254,9 @@ static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe, } if (speq->thread) { - speq->pid = speq->thread->pid_; + speq->pid = thread__pid(speq->thread); if (queue->cpu == -1) - speq->cpu = speq->thread->cpu; + speq->cpu = thread__cpu(speq->thread); } } diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0f5be4ad24ba..b550c7393155 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1311,7 +1311,7 @@ static void cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm, tidq->tid); if (tidq->thread) - tidq->pid = tidq->thread->pid_; + tidq->pid = thread__pid(tidq->thread); } int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 653709ab867a..291591e303cd 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -172,13 +172,13 @@ static int process_sample_event(struct perf_tool *tool, output_json_format(out, false, 2, "{"); output_json_key_format(out, false, 3, "timestamp", "%" PRIi64, sample->time); - output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_); - output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid); + output_json_key_format(out, true, 3, "pid", "%i", thread__pid(al.thread)); + output_json_key_format(out, true, 3, "tid", "%i", thread__tid(al.thread)); if ((sample_type & PERF_SAMPLE_CPU)) output_json_key_format(out, true, 3, "cpu", "%i", sample->cpu); - else if (al.thread->cpu >= 0) - output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu); + else if (thread__cpu(al.thread) >= 0) + output_json_key_format(out, true, 3, "cpu", "%i", thread__cpu(al.thread)); output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread)); diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 84c970c11794..751fd53bfd93 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -64,13 +64,13 @@ int db_export__thread(struct db_export *dbe, struct thread *thread, { u64 main_thread_db_id = 0; - if (thread->db_id) + if (thread__db_id(thread)) return 0; - thread->db_id = ++dbe->thread_last_db_id; + thread__set_db_id(thread, ++dbe->thread_last_db_id); if (main_thread) - main_thread_db_id = main_thread->db_id; + main_thread_db_id = thread__db_id(main_thread); if (dbe->export_thread) return dbe->export_thread(dbe, thread, main_thread_db_id, @@ -251,7 +251,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, */ al.sym = node->ms.sym; al.map = node->ms.map; - al.maps = thread->maps; + al.maps = thread__maps(thread); al.addr = node->ip; if (al.map && !al.sym) @@ -321,7 +321,7 @@ static int db_export__threads(struct db_export *dbe, struct thread *thread, * For a non-main thread, db_export__comm_thread() must be * called only if thread has not previously been exported. */ - bool export_comm_thread = comm && !thread->db_id; + bool export_comm_thread = comm && !thread__db_id(thread); err = db_export__thread(dbe, thread, machine, main_thread); if (err) @@ -529,16 +529,16 @@ static int db_export__pid_tid(struct db_export *dbe, struct machine *machine, struct thread *main_thread; int err = 0; - if (!thread || !thread->comm_set) + if (!thread || !thread__comm_set(thread)) goto out_put; - *is_idle = !thread->pid_ && !thread->tid; + *is_idle = !thread__pid(thread) && !thread__tid(thread); main_thread = thread__main_thread(machine, thread); err = db_export__threads(dbe, thread, main_thread, machine, comm_ptr); - *db_id = thread->db_id; + *db_id = thread__db_id(thread); thread__put(main_thread); out_put: diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index 16238f823a5e..8016f21dc0b8 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -197,8 +197,8 @@ static const __u8 *dlfilter__insn(void *ctx, __u32 *len) if (!al->thread && machine__resolve(d->machine, al, d->sample) < 0) return NULL; - if (al->thread->maps) { - struct machine *machine = maps__machine(al->thread->maps); + if (thread__maps(al->thread)) { + struct machine *machine = maps__machine(thread__maps(al->thread)); if (machine) script_fetch_insn(d->sample, al->thread, machine); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index e8b0666d913c..e1ce7cb5e421 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -573,7 +573,7 @@ int perf_event__process(struct perf_tool *tool __maybe_unused, struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, struct addr_location *al) { - struct maps *maps = thread->maps; + struct maps *maps = thread__maps(thread); struct machine *machine = maps__machine(maps); bool load_map = false; @@ -639,7 +639,7 @@ struct map *thread__find_map_fb(struct thread *thread, u8 cpumode, u64 addr, struct addr_location *al) { struct map *map = thread__find_map(thread, cpumode, addr, al); - struct machine *machine = maps__machine(thread->maps); + struct machine *machine = maps__machine(thread__maps(thread)); u8 addr_cpumode = machine__addr_cpumode(machine, cpumode, addr); if (map || addr_cpumode == cpumode) @@ -696,7 +696,7 @@ int machine__resolve(struct machine *machine, struct addr_location *al, if (thread == NULL) return -1; - dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid); + dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread__tid(thread)); thread__find_map(thread, sample->cpumode, sample->ip, al); dso = al->map ? map__dso(al->map) : NULL; dump_printf(" ...... dso: %s\n", diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 3c9301a26dfc..4bc3affbe891 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2778,12 +2778,12 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh if (hists__has(hists, thread)) { printed += scnprintf(bf + printed, size - printed, ", Thread: %s(%d)", - (thread->comm_set ? thread__comm_str(thread) : ""), - thread->tid); + (thread__comm_set(thread) ? thread__comm_str(thread) : ""), + thread__tid(thread)); } else { printed += scnprintf(bf + printed, size - printed, ", Thread: %s", - (thread->comm_set ? thread__comm_str(thread) : "")); + (thread__comm_set(thread) ? thread__comm_str(thread) : "")); } } if (dso) diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 2c8147a62203..ec1b3bd9f530 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -456,7 +456,7 @@ static int intel_bts_process_queue(struct intel_bts_queue *btsq, u64 *timestamp) thread = machine__find_thread(btsq->bts->machine, -1, btsq->tid); if (thread) - btsq->pid = thread->pid_; + btsq->pid = thread__pid(thread); } else { thread = machine__findnew_thread(btsq->bts->machine, btsq->pid, btsq->tid); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index dde2ca77a005..45c7e7722916 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1428,13 +1428,13 @@ static int intel_pt_get_guest_from_sideband(struct intel_pt_queue *ptq) ptq->guest_machine = machine; } - vcpu = ptq->thread ? ptq->thread->guest_cpu : -1; + vcpu = ptq->thread ? thread__guest_cpu(ptq->thread) : -1; if (vcpu < 0) return -1; tid = machine__get_current_tid(machine, vcpu); - if (ptq->guest_thread && ptq->guest_thread->tid != tid) + if (ptq->guest_thread && thread__tid(ptq->guest_thread) != tid) thread__zput(ptq->guest_thread); if (!ptq->guest_thread) { @@ -1444,7 +1444,7 @@ static int intel_pt_get_guest_from_sideband(struct intel_pt_queue *ptq) } ptq->guest_machine_pid = machine_pid; - ptq->guest_pid = ptq->guest_thread->pid_; + ptq->guest_pid = thread__pid(ptq->guest_thread); ptq->guest_tid = tid; ptq->vcpu = vcpu; @@ -1467,9 +1467,9 @@ static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt, ptq->thread = machine__find_thread(pt->machine, -1, ptq->tid); if (ptq->thread) { - ptq->pid = ptq->thread->pid_; + ptq->pid = thread__pid(ptq->thread); if (queue->cpu == -1) - ptq->cpu = ptq->thread->cpu; + ptq->cpu = thread__cpu(ptq->thread); } if (pt->have_guest_sideband && intel_pt_get_guest_from_sideband(ptq)) { @@ -3074,7 +3074,7 @@ static void intel_pt_sample_set_pid_tid_cpu(struct intel_pt_queue *ptq, if (ptq->pid == -1) { ptq->thread = machine__find_thread(m, -1, ptq->tid); if (ptq->thread) - ptq->pid = ptq->thread->pid_; + ptq->pid = thread__pid(ptq->thread); return; } diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 28e49502db5e..2380b41a4caa 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -799,17 +799,19 @@ static void jit_add_pid(struct machine *machine, pid_t pid) return; } - thread->priv = (void *)1; + thread__set_priv(thread, (void *)true); } static bool jit_has_pid(struct machine *machine, pid_t pid) { struct thread *thread = machine__find_thread(machine, pid, pid); + void *priv; if (!thread) - return 0; + return false; - return (bool)thread->priv; + priv = thread__priv(thread); + return (bool)priv; } int @@ -833,7 +835,7 @@ jit_process(struct perf_session *session, return 0; } - nsi = nsinfo__get(thread->nsinfo); + nsi = nsinfo__get(thread__nsinfo(thread)); thread__put(thread); /* diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index cbf092e32ee9..5d34d60a0045 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -77,13 +77,14 @@ static int thread_rb_node__cmp_tid(const void *key, const struct rb_node *nd) { int to_find = (int) *((pid_t *)key); - return to_find - (int)rb_entry(nd, struct thread_rb_node, rb_node)->thread->tid; + return to_find - (int)thread__tid(rb_entry(nd, struct thread_rb_node, rb_node)->thread); } static struct thread_rb_node *thread_rb_node__find(const struct thread *th, struct rb_root *tree) { - struct rb_node *nd = rb_find(&th->tid, tree, thread_rb_node__cmp_tid); + pid_t to_find = thread__tid(th); + struct rb_node *nd = rb_find(&to_find, tree, thread_rb_node__cmp_tid); return rb_entry(nd, struct thread_rb_node, rb_node); } @@ -440,7 +441,7 @@ static struct thread *findnew_guest_code(struct machine *machine, return NULL; /* Assume maps are set up if there are any */ - if (maps__nr_maps(thread->maps)) + if (maps__nr_maps(thread__maps(thread))) return thread; host_thread = machine__find_thread(host_machine, -1, pid); @@ -453,7 +454,7 @@ static struct thread *findnew_guest_code(struct machine *machine, * Guest code can be found in hypervisor process at the same address * so copy host maps. */ - err = maps__clone(thread, host_thread->maps); + err = maps__clone(thread, thread__maps(host_thread)); thread__put(host_thread); if (err) goto out_err; @@ -518,45 +519,45 @@ static void machine__update_thread_pid(struct machine *machine, { struct thread *leader; - if (pid == th->pid_ || pid == -1 || th->pid_ != -1) + if (pid == thread__pid(th) || pid == -1 || thread__pid(th) != -1) return; - th->pid_ = pid; + thread__set_pid(th, pid); - if (th->pid_ == th->tid) + if (thread__pid(th) == thread__tid(th)) return; - leader = __machine__findnew_thread(machine, th->pid_, th->pid_); + leader = __machine__findnew_thread(machine, thread__pid(th), thread__pid(th)); if (!leader) goto out_err; - if (!leader->maps) - leader->maps = maps__new(machine); + if (!thread__maps(leader)) + thread__set_maps(leader, maps__new(machine)); - if (!leader->maps) + if (!thread__maps(leader)) goto out_err; - if (th->maps == leader->maps) + if (thread__maps(th) == thread__maps(leader)) return; - if (th->maps) { + if (thread__maps(th)) { /* * Maps are created from MMAP events which provide the pid and * tid. Consequently there never should be any maps on a thread * with an unknown pid. Just print an error if there are. */ - if (!maps__empty(th->maps)) + if (!maps__empty(thread__maps(th))) pr_err("Discarding thread maps for %d:%d\n", - th->pid_, th->tid); - maps__put(th->maps); + thread__pid(th), thread__tid(th)); + maps__put(thread__maps(th)); } - th->maps = maps__get(leader->maps); + thread__set_maps(th, maps__get(thread__maps(leader))); out_put: thread__put(leader); return; out_err: - pr_err("Failed to join map groups for %d:%d\n", th->pid_, th->tid); + pr_err("Failed to join map groups for %d:%d\n", thread__pid(th), thread__tid(th)); goto out_put; } @@ -573,7 +574,7 @@ __threads__get_last_match(struct threads *threads, struct machine *machine, th = threads->last_match; if (th != NULL) { - if (th->tid == tid) { + if (thread__tid(th) == tid) { machine__update_thread_pid(machine, th, pid); return thread__get(th); } @@ -632,13 +633,13 @@ static struct thread *____machine__findnew_thread(struct machine *machine, parent = *p; th = rb_entry(parent, struct thread_rb_node, rb_node)->thread; - if (th->tid == tid) { + if (thread__tid(th) == tid) { threads__set_last_match(threads, th); machine__update_thread_pid(machine, th, pid); return thread__get(th); } - if (tid < th->tid) + if (tid < thread__tid(th)) p = &(*p)->rb_left; else { p = &(*p)->rb_right; @@ -2049,7 +2050,7 @@ out_problem: static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd, struct thread *th, bool lock) { - struct threads *threads = machine__threads(machine, th->tid); + struct threads *threads = machine__threads(machine, thread__tid(th)); if (!nd) nd = thread_rb_node__find(th, &threads->entries.rb_root); @@ -2060,7 +2061,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread_rb_n if (lock) down_write(&threads->lock); - BUG_ON(refcount_read(&th->refcnt) == 0); + BUG_ON(refcount_read(thread__refcnt(th)) == 0); thread__put(nd->thread); rb_erase_cached(&nd->rb_node, &threads->entries); @@ -2099,9 +2100,9 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event * (fork) event that would have removed the thread was lost. Assume the * latter case and continue on as best we can. */ - if (parent->pid_ != (pid_t)event->fork.ppid) { + if (thread__pid(parent) != (pid_t)event->fork.ppid) { dump_printf("removing erroneous parent thread %d/%d\n", - parent->pid_, parent->tid); + thread__pid(parent), thread__tid(parent)); machine__remove_thread(machine, parent); thread__put(parent); parent = machine__findnew_thread(machine, event->fork.ppid, @@ -2511,7 +2512,7 @@ static void save_lbr_cursor_node(struct thread *thread, struct callchain_cursor *cursor, int idx) { - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct lbr_stitch *lbr_stitch = thread__lbr_stitch(thread); if (!lbr_stitch) return; @@ -2553,7 +2554,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, * in callchain_cursor_commit() when the writing session is closed. * Using curr and pos to track the current cursor node. */ - if (thread->lbr_stitch) { + if (thread__lbr_stitch(thread)) { cursor->curr = NULL; cursor->pos = cursor->nr; if (cursor->nr) { @@ -2581,7 +2582,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, * But does not need to save current cursor node for entry 0. * It's impossible to stitch the whole LBRs of previous sample. */ - if (thread->lbr_stitch && (cursor->pos != cursor->nr)) { + if (thread__lbr_stitch(thread) && (cursor->pos != cursor->nr)) { if (!cursor->curr) cursor->curr = cursor->first; else @@ -2634,7 +2635,7 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread, static int lbr_callchain_add_stitched_lbr_ip(struct thread *thread, struct callchain_cursor *cursor) { - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct lbr_stitch *lbr_stitch = thread__lbr_stitch(thread); struct callchain_cursor_node *cnode; struct stitch_list *stitch_node; int err; @@ -2658,7 +2659,7 @@ static int lbr_callchain_add_stitched_lbr_ip(struct thread *thread, static struct stitch_list *get_stitch_node(struct thread *thread) { - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct lbr_stitch *lbr_stitch = thread__lbr_stitch(thread); struct stitch_list *stitch_node; if (!list_empty(&lbr_stitch->free_lists)) { @@ -2682,7 +2683,7 @@ static bool has_stitched_lbr(struct thread *thread, struct branch_entry *cur_entries = perf_sample__branch_entries(cur); struct branch_stack *prev_stack = prev->branch_stack; struct branch_entry *prev_entries = perf_sample__branch_entries(prev); - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct lbr_stitch *lbr_stitch = thread__lbr_stitch(thread); int i, j, nr_identical_branches = 0; struct stitch_list *stitch_node; u64 cur_base, distance; @@ -2746,27 +2747,29 @@ static bool has_stitched_lbr(struct thread *thread, static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) { - if (thread->lbr_stitch) + if (thread__lbr_stitch(thread)) return true; - thread->lbr_stitch = zalloc(sizeof(*thread->lbr_stitch)); - if (!thread->lbr_stitch) + thread__set_lbr_stitch(thread, zalloc(sizeof(struct lbr_stitch))); + if (!thread__lbr_stitch(thread)) goto err; - thread->lbr_stitch->prev_lbr_cursor = calloc(max_lbr + 1, sizeof(struct callchain_cursor_node)); - if (!thread->lbr_stitch->prev_lbr_cursor) + thread__lbr_stitch(thread)->prev_lbr_cursor = + calloc(max_lbr + 1, sizeof(struct callchain_cursor_node)); + if (!thread__lbr_stitch(thread)->prev_lbr_cursor) goto free_lbr_stitch; - INIT_LIST_HEAD(&thread->lbr_stitch->lists); - INIT_LIST_HEAD(&thread->lbr_stitch->free_lists); + INIT_LIST_HEAD(&thread__lbr_stitch(thread)->lists); + INIT_LIST_HEAD(&thread__lbr_stitch(thread)->free_lists); return true; free_lbr_stitch: - zfree(&thread->lbr_stitch); + free(thread__lbr_stitch(thread)); + thread__set_lbr_stitch(thread, NULL); err: pr_warning("Failed to allocate space for stitched LBRs. Disable LBR stitch\n"); - thread->lbr_stitch_enable = false; + thread__set_lbr_stitch_enable(thread, false); return false; } @@ -2802,9 +2805,9 @@ static int resolve_lbr_callchain_sample(struct thread *thread, if (i == chain_nr) return 0; - if (thread->lbr_stitch_enable && !sample->no_hw_idx && + if (thread__lbr_stitch_enable(thread) && !sample->no_hw_idx && (max_lbr > 0) && alloc_lbr_stitch(thread, max_lbr)) { - lbr_stitch = thread->lbr_stitch; + lbr_stitch = thread__lbr_stitch(thread); stitched_lbr = has_stitched_lbr(thread, sample, &lbr_stitch->prev_sample, @@ -2884,7 +2887,7 @@ static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread, static u64 get_leaf_frame_caller(struct perf_sample *sample, struct thread *thread, int usr_idx) { - if (machine__normalized_is(maps__machine(thread->maps), "arm64")) + if (machine__normalized_is(maps__machine(thread__maps(thread)), "arm64")) return get_leaf_frame_caller_aarch64(sample, thread, usr_idx); else return 0; @@ -3265,7 +3268,7 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid, if (!thread) return -ENOMEM; - thread->cpu = cpu; + thread__set_cpu(thread, cpu); thread__put(thread); return 0; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 4d9944bbf5e4..ae1d54d4880a 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -137,7 +137,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, no_dso = is_no_dso_memory(filename); map->prot = prot; map->flags = flags; - nsi = nsinfo__get(thread->nsinfo); + nsi = nsinfo__get(thread__nsinfo(thread)); if ((anon || no_dso) && nsi && (prot & PROT_EXEC)) { snprintf(newfilename, sizeof(newfilename), diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 1aeb1db58fe5..5ae6379a1b42 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -384,7 +384,7 @@ put_map: */ int maps__clone(struct thread *thread, struct maps *parent) { - struct maps *maps = thread->maps; + struct maps *maps = thread__maps(thread); int err; struct map_rb_node *rb_node; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 40964078f92f..f3d262e871ac 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1163,11 +1163,11 @@ static int python_export_thread(struct db_export *dbe, struct thread *thread, t = tuple_new(5); - tuple_set_d64(t, 0, thread->db_id); + tuple_set_d64(t, 0, thread__db_id(thread)); tuple_set_d64(t, 1, machine->db_id); tuple_set_d64(t, 2, main_thread_db_id); - tuple_set_s32(t, 3, thread->pid_); - tuple_set_s32(t, 4, thread->tid); + tuple_set_s32(t, 3, thread__pid(thread)); + tuple_set_s32(t, 4, thread__tid(thread)); call_object(tables->thread_handler, t, "thread_table"); @@ -1186,7 +1186,7 @@ static int python_export_comm(struct db_export *dbe, struct comm *comm, tuple_set_d64(t, 0, comm->db_id); tuple_set_string(t, 1, comm__str(comm)); - tuple_set_d64(t, 2, thread->db_id); + tuple_set_d64(t, 2, thread__db_id(thread)); tuple_set_d64(t, 3, comm->start); tuple_set_s32(t, 4, comm->exec); @@ -1207,7 +1207,7 @@ static int python_export_comm_thread(struct db_export *dbe, u64 db_id, tuple_set_d64(t, 0, db_id); tuple_set_d64(t, 1, comm->db_id); - tuple_set_d64(t, 2, thread->db_id); + tuple_set_d64(t, 2, thread__db_id(thread)); call_object(tables->comm_thread_handler, t, "comm_thread_table"); @@ -1292,7 +1292,7 @@ static void python_export_sample_table(struct db_export *dbe, tuple_set_d64(t, 0, es->db_id); tuple_set_d64(t, 1, es->evsel->db_id); tuple_set_d64(t, 2, maps__machine(es->al->maps)->db_id); - tuple_set_d64(t, 3, es->al->thread->db_id); + tuple_set_d64(t, 3, thread__db_id(es->al->thread)); tuple_set_d64(t, 4, es->comm_db_id); tuple_set_d64(t, 5, es->dso_db_id); tuple_set_d64(t, 6, es->sym_db_id); @@ -1382,7 +1382,7 @@ static int python_export_call_return(struct db_export *dbe, t = tuple_new(14); tuple_set_d64(t, 0, cr->db_id); - tuple_set_d64(t, 1, cr->thread->db_id); + tuple_set_d64(t, 1, thread__db_id(cr->thread)); tuple_set_d64(t, 2, comm_db_id); tuple_set_d64(t, 3, cr->cp->db_id); tuple_set_d64(t, 4, cr->call_time); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e2806791c76a..65ac9f7fdf7e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2807,7 +2807,7 @@ static int perf_session__set_guest_cpu(struct perf_session *session, pid_t pid, if (!thread) return -ENOMEM; - thread->guest_cpu = guest_cpu; + thread__set_guest_cpu(thread, guest_cpu); thread__put(thread); return 0; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 650cd8df4041..5e45c770f91d 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -108,7 +108,7 @@ static int64_t cmp_null(const void *l, const void *r) static int64_t sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) { - return right->thread->tid - left->thread->tid; + return thread__tid(right->thread) - thread__tid(left->thread); } static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf, @@ -117,7 +117,7 @@ static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf, const char *comm = thread__comm_str(he->thread); width = max(7U, width) - 8; - return repsep_snprintf(bf, size, "%7d:%-*.*s", he->thread->tid, + return repsep_snprintf(bf, size, "%7d:%-*.*s", thread__tid(he->thread), width, width, comm ?: ""); } @@ -1543,8 +1543,10 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) !l_dso->id.ino && !l_dso->id.ino_generation) { /* userspace anonymous */ - if (left->thread->pid_ > right->thread->pid_) return -1; - if (left->thread->pid_ < right->thread->pid_) return 1; + if (thread__pid(left->thread) > thread__pid(right->thread)) + return -1; + if (thread__pid(left->thread) < thread__pid(right->thread)) + return 1; } addr: diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 4b85c1728012..374d142e7390 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -112,7 +112,7 @@ struct thread_stack { */ static inline bool thread_stack__per_cpu(struct thread *thread) { - return !(thread->tid || thread->pid_); + return !(thread__tid(thread) || thread__pid(thread)); } static int thread_stack__grow(struct thread_stack *ts) @@ -155,8 +155,8 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread, ts->br_stack_sz = br_stack_sz; } - if (thread->maps && maps__machine(thread->maps)) { - struct machine *machine = maps__machine(thread->maps); + if (thread__maps(thread) && maps__machine(thread__maps(thread))) { + struct machine *machine = maps__machine(thread__maps(thread)); const char *arch = perf_env__arch(machine->env); ts->kernel_start = machine__kernel_start(machine); @@ -175,7 +175,7 @@ static struct thread_stack *thread_stack__new(struct thread *thread, int cpu, bool callstack, unsigned int br_stack_sz) { - struct thread_stack *ts = thread->ts, *new_ts; + struct thread_stack *ts = thread__ts(thread), *new_ts; unsigned int old_sz = ts ? ts->arr_sz : 0; unsigned int new_sz = 1; @@ -189,8 +189,8 @@ static struct thread_stack *thread_stack__new(struct thread *thread, int cpu, if (ts) memcpy(new_ts, ts, old_sz * sizeof(*ts)); new_ts->arr_sz = new_sz; - zfree(&thread->ts); - thread->ts = new_ts; + free(thread__ts(thread)); + thread__set_ts(thread, new_ts); ts = new_ts; } @@ -207,7 +207,7 @@ static struct thread_stack *thread_stack__new(struct thread *thread, int cpu, static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu) { - struct thread_stack *ts = thread->ts; + struct thread_stack *ts = thread__ts(thread); if (cpu < 0) cpu = 0; @@ -232,7 +232,7 @@ static inline struct thread_stack *thread__stack(struct thread *thread, if (thread_stack__per_cpu(thread)) return thread__cpu_stack(thread, cpu); - return thread->ts; + return thread__ts(thread); } static int thread_stack__push(struct thread_stack *ts, u64 ret_addr, @@ -363,7 +363,7 @@ static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts) int thread_stack__flush(struct thread *thread) { - struct thread_stack *ts = thread->ts; + struct thread_stack *ts = thread__ts(thread); unsigned int pos; int err = 0; @@ -502,13 +502,14 @@ static void thread_stack__reset(struct thread *thread, struct thread_stack *ts) void thread_stack__free(struct thread *thread) { - struct thread_stack *ts = thread->ts; + struct thread_stack *ts = thread__ts(thread); unsigned int pos; if (ts) { for (pos = 0; pos < ts->arr_sz; pos++) __thread_stack__free(thread, ts + pos); - zfree(&thread->ts); + free(thread__ts(thread)); + thread__set_ts(thread, NULL); } } @@ -1127,7 +1128,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm, ts->rstate = X86_RETPOLINE_POSSIBLE; /* Flush stack on exec */ - if (ts->comm != comm && thread->pid_ == thread->tid) { + if (ts->comm != comm && thread__pid(thread) == thread__tid(thread)) { err = __thread_stack__flush(thread, ts); if (err) return err; diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 38d300e3e4d3..9a1db3be6436 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -21,19 +21,20 @@ int thread__init_maps(struct thread *thread, struct machine *machine) { - pid_t pid = thread->pid_; + pid_t pid = thread__pid(thread); - if (pid == thread->tid || pid == -1) { - thread->maps = maps__new(machine); + if (pid == thread__tid(thread) || pid == -1) { + thread__set_maps(thread, maps__new(machine)); } else { struct thread *leader = __machine__findnew_thread(machine, pid, pid); + if (leader) { - thread->maps = maps__get(leader->maps); + thread__set_maps(thread, maps__get(thread__maps(leader))); thread__put(leader); } } - return thread->maps ? 0 : -1; + return thread__maps(thread) ? 0 : -1; } struct thread *thread__new(pid_t pid, pid_t tid) @@ -43,16 +44,16 @@ struct thread *thread__new(pid_t pid, pid_t tid) struct thread *thread = zalloc(sizeof(*thread)); if (thread != NULL) { - thread->pid_ = pid; - thread->tid = tid; - thread->ppid = -1; - thread->cpu = -1; - thread->guest_cpu = -1; - thread->lbr_stitch_enable = false; - INIT_LIST_HEAD(&thread->namespaces_list); - INIT_LIST_HEAD(&thread->comm_list); - init_rwsem(&thread->namespaces_lock); - init_rwsem(&thread->comm_lock); + thread__set_pid(thread, pid); + thread__set_tid(thread, tid); + thread__set_ppid(thread, -1); + thread__set_cpu(thread, -1); + thread__set_guest_cpu(thread, -1); + thread__set_lbr_stitch_enable(thread, false); + INIT_LIST_HEAD(thread__namespaces_list(thread)); + INIT_LIST_HEAD(thread__comm_list(thread)); + init_rwsem(thread__namespaces_lock(thread)); + init_rwsem(thread__comm_lock(thread)); comm_str = malloc(32); if (!comm_str) @@ -64,11 +65,11 @@ struct thread *thread__new(pid_t pid, pid_t tid) if (!comm) goto err_thread; - list_add(&comm->list, &thread->comm_list); - refcount_set(&thread->refcnt, 1); + list_add(&comm->list, thread__comm_list(thread)); + refcount_set(thread__refcnt(thread), 1); /* Thread holds first ref to nsdata. */ thread->nsinfo = nsinfo__new(pid); - srccode_state_init(&thread->srccode_state); + srccode_state_init(thread__srccode_state(thread)); } return thread; @@ -85,30 +86,30 @@ void thread__delete(struct thread *thread) thread_stack__free(thread); - if (thread->maps) { - maps__put(thread->maps); - thread->maps = NULL; + if (thread__maps(thread)) { + maps__put(thread__maps(thread)); + thread__set_maps(thread, NULL); } - down_write(&thread->namespaces_lock); + down_write(thread__namespaces_lock(thread)); list_for_each_entry_safe(namespaces, tmp_namespaces, - &thread->namespaces_list, list) { + thread__namespaces_list(thread), list) { list_del_init(&namespaces->list); namespaces__free(namespaces); } - up_write(&thread->namespaces_lock); + up_write(thread__namespaces_lock(thread)); - down_write(&thread->comm_lock); - list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) { + down_write(thread__comm_lock(thread)); + list_for_each_entry_safe(comm, tmp_comm, thread__comm_list(thread), list) { list_del_init(&comm->list); comm__free(comm); } - up_write(&thread->comm_lock); + up_write(thread__comm_lock(thread)); nsinfo__zput(thread->nsinfo); - srccode_state_free(&thread->srccode_state); + srccode_state_free(thread__srccode_state(thread)); - exit_rwsem(&thread->namespaces_lock); - exit_rwsem(&thread->comm_lock); + exit_rwsem(thread__namespaces_lock(thread)); + exit_rwsem(thread__comm_lock(thread)); thread__free_stitch_list(thread); free(thread); } @@ -116,31 +117,31 @@ void thread__delete(struct thread *thread) struct thread *thread__get(struct thread *thread) { if (thread) - refcount_inc(&thread->refcnt); + refcount_inc(thread__refcnt(thread)); return thread; } void thread__put(struct thread *thread) { - if (thread && refcount_dec_and_test(&thread->refcnt)) + if (thread && refcount_dec_and_test(thread__refcnt(thread))) thread__delete(thread); } -static struct namespaces *__thread__namespaces(const struct thread *thread) +static struct namespaces *__thread__namespaces(struct thread *thread) { - if (list_empty(&thread->namespaces_list)) + if (list_empty(thread__namespaces_list(thread))) return NULL; - return list_first_entry(&thread->namespaces_list, struct namespaces, list); + return list_first_entry(thread__namespaces_list(thread), struct namespaces, list); } struct namespaces *thread__namespaces(struct thread *thread) { struct namespaces *ns; - down_read(&thread->namespaces_lock); + down_read(thread__namespaces_lock(thread)); ns = __thread__namespaces(thread); - up_read(&thread->namespaces_lock); + up_read(thread__namespaces_lock(thread)); return ns; } @@ -154,7 +155,7 @@ static int __thread__set_namespaces(struct thread *thread, u64 timestamp, if (!new) return -ENOMEM; - list_add(&new->list, &thread->namespaces_list); + list_add(&new->list, thread__namespaces_list(thread)); if (timestamp && curr) { /* @@ -174,25 +175,25 @@ int thread__set_namespaces(struct thread *thread, u64 timestamp, { int ret; - down_write(&thread->namespaces_lock); + down_write(thread__namespaces_lock(thread)); ret = __thread__set_namespaces(thread, timestamp, event); - up_write(&thread->namespaces_lock); + up_write(thread__namespaces_lock(thread)); return ret; } -struct comm *thread__comm(const struct thread *thread) +struct comm *thread__comm(struct thread *thread) { - if (list_empty(&thread->comm_list)) + if (list_empty(thread__comm_list(thread))) return NULL; - return list_first_entry(&thread->comm_list, struct comm, list); + return list_first_entry(thread__comm_list(thread), struct comm, list); } -struct comm *thread__exec_comm(const struct thread *thread) +struct comm *thread__exec_comm(struct thread *thread) { struct comm *comm, *last = NULL, *second_last = NULL; - list_for_each_entry(comm, &thread->comm_list, list) { + list_for_each_entry(comm, thread__comm_list(thread), list) { if (comm->exec) return comm; second_last = last; @@ -205,7 +206,7 @@ struct comm *thread__exec_comm(const struct thread *thread) * thread, that is very probably wrong. Prefer a later comm to avoid * that case. */ - if (second_last && !last->start && thread->pid_ == thread->tid) + if (second_last && !last->start && thread__pid(thread) == thread__tid(thread)) return second_last; return last; @@ -217,7 +218,7 @@ static int ____thread__set_comm(struct thread *thread, const char *str, struct comm *new, *curr = thread__comm(thread); /* Override the default :tid entry */ - if (!thread->comm_set) { + if (!thread__comm_set(thread)) { int err = comm__override(curr, str, timestamp, exec); if (err) return err; @@ -225,13 +226,13 @@ static int ____thread__set_comm(struct thread *thread, const char *str, new = comm__new(str, timestamp, exec); if (!new) return -ENOMEM; - list_add(&new->list, &thread->comm_list); + list_add(&new->list, thread__comm_list(thread)); if (exec) - unwind__flush_access(thread->maps); + unwind__flush_access(thread__maps(thread)); } - thread->comm_set = true; + thread__set_comm_set(thread, true); return 0; } @@ -241,9 +242,9 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, { int ret; - down_write(&thread->comm_lock); + down_write(thread__comm_lock(thread)); ret = ____thread__set_comm(thread, str, timestamp, exec); - up_write(&thread->comm_lock); + up_write(thread__comm_lock(thread)); return ret; } @@ -255,7 +256,7 @@ int thread__set_comm_from_proc(struct thread *thread) int err = -1; if (!(snprintf(path, sizeof(path), "%d/task/%d/comm", - thread->pid_, thread->tid) >= (int)sizeof(path)) && + thread__pid(thread), thread__tid(thread)) >= (int)sizeof(path)) && procfs__read_str(path, &comm, &sz) == 0) { comm[sz - 1] = '\0'; err = thread__set_comm(thread, comm, 0); @@ -264,7 +265,7 @@ int thread__set_comm_from_proc(struct thread *thread) return err; } -static const char *__thread__comm_str(const struct thread *thread) +static const char *__thread__comm_str(struct thread *thread) { const struct comm *comm = thread__comm(thread); @@ -278,9 +279,9 @@ const char *thread__comm_str(struct thread *thread) { const char *str; - down_read(&thread->comm_lock); + down_read(thread__comm_lock(thread)); str = __thread__comm_str(thread); - up_read(&thread->comm_lock); + up_read(thread__comm_lock(thread)); return str; } @@ -289,23 +290,23 @@ static int __thread__comm_len(struct thread *thread, const char *comm) { if (!comm) return 0; - thread->comm_len = strlen(comm); + thread__set_comm_len(thread, strlen(comm)); - return thread->comm_len; + return thread__var_comm_len(thread); } /* CHECKME: it should probably better return the max comm len from its comm list */ int thread__comm_len(struct thread *thread) { - int comm_len = thread->comm_len; + int comm_len = thread__var_comm_len(thread); if (!comm_len) { const char *comm; - down_read(&thread->comm_lock); + down_read(thread__comm_lock(thread)); comm = __thread__comm_str(thread); comm_len = __thread__comm_len(thread, comm); - up_read(&thread->comm_lock); + up_read(thread__comm_lock(thread)); } return comm_len; @@ -313,33 +314,33 @@ int thread__comm_len(struct thread *thread) size_t thread__fprintf(struct thread *thread, FILE *fp) { - return fprintf(fp, "Thread %d %s\n", thread->tid, thread__comm_str(thread)) + - maps__fprintf(thread->maps, fp); + return fprintf(fp, "Thread %d %s\n", thread__tid(thread), thread__comm_str(thread)) + + maps__fprintf(thread__maps(thread), fp); } int thread__insert_map(struct thread *thread, struct map *map) { int ret; - ret = unwind__prepare_access(thread->maps, map, NULL); + ret = unwind__prepare_access(thread__maps(thread), map, NULL); if (ret) return ret; - maps__fixup_overlappings(thread->maps, map, stderr); - return maps__insert(thread->maps, map); + maps__fixup_overlappings(thread__maps(thread), map, stderr); + return maps__insert(thread__maps(thread), map); } static int __thread__prepare_access(struct thread *thread) { bool initialized = false; int err = 0; - struct maps *maps = thread->maps; + struct maps *maps = thread__maps(thread); struct map_rb_node *rb_node; down_read(maps__lock(maps)); maps__for_each_entry(maps, rb_node) { - err = unwind__prepare_access(thread->maps, rb_node->map, &initialized); + err = unwind__prepare_access(thread__maps(thread), rb_node->map, &initialized); if (err || initialized) break; } @@ -362,21 +363,22 @@ static int thread__prepare_access(struct thread *thread) static int thread__clone_maps(struct thread *thread, struct thread *parent, bool do_maps_clone) { /* This is new thread, we share map groups for process. */ - if (thread->pid_ == parent->pid_) + if (thread__pid(thread) == thread__pid(parent)) return thread__prepare_access(thread); - if (thread->maps == parent->maps) { + if (thread__maps(thread) == thread__maps(parent)) { pr_debug("broken map groups on thread %d/%d parent %d/%d\n", - thread->pid_, thread->tid, parent->pid_, parent->tid); + thread__pid(thread), thread__tid(thread), + thread__pid(parent), thread__tid(parent)); return 0; } /* But this one is new process, copy maps. */ - return do_maps_clone ? maps__clone(thread, parent->maps) : 0; + return do_maps_clone ? maps__clone(thread, thread__maps(parent)) : 0; } int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone) { - if (parent->comm_set) { + if (thread__comm_set(parent)) { const char *comm = thread__comm_str(parent); int err; if (!comm) @@ -386,7 +388,7 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bo return err; } - thread->ppid = parent->tid; + thread__set_ppid(thread, thread__tid(parent)); return thread__clone_maps(thread, parent, do_maps_clone); } @@ -410,13 +412,13 @@ void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, struct thread *thread__main_thread(struct machine *machine, struct thread *thread) { - if (thread->pid_ == thread->tid) + if (thread__pid(thread) == thread__tid(thread)) return thread__get(thread); - if (thread->pid_ == -1) + if (thread__pid(thread) == -1) return NULL; - return machine__find_thread(machine, thread->pid_, thread->pid_); + return machine__find_thread(machine, thread__pid(thread), thread__pid(thread)); } int thread__memcpy(struct thread *thread, struct machine *machine, @@ -447,7 +449,7 @@ int thread__memcpy(struct thread *thread, struct machine *machine, void thread__free_stitch_list(struct thread *thread) { - struct lbr_stitch *lbr_stitch = thread->lbr_stitch; + struct lbr_stitch *lbr_stitch = thread__lbr_stitch(thread); struct stitch_list *pos, *tmp; if (!lbr_stitch) @@ -464,5 +466,6 @@ void thread__free_stitch_list(struct thread *thread) } zfree(&lbr_stitch->prev_lbr_cursor); - zfree(&thread->lbr_stitch); + free(thread__lbr_stitch(thread)); + thread__set_lbr_stitch(thread, NULL); } diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 3b3f9fb5a916..b103992c3831 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -96,8 +96,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm, int thread__set_comm_from_proc(struct thread *thread); int thread__comm_len(struct thread *thread); -struct comm *thread__comm(const struct thread *thread); -struct comm *thread__exec_comm(const struct thread *thread); +struct comm *thread__comm(struct thread *thread); +struct comm *thread__exec_comm(struct thread *thread); const char *thread__comm_str(struct thread *thread); int thread__insert_map(struct thread *thread, struct map *map); int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone); @@ -121,6 +121,126 @@ void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, int thread__memcpy(struct thread *thread, struct machine *machine, void *buf, u64 ip, int len, bool *is64bit); +static inline struct maps *thread__maps(struct thread *thread) +{ + return thread->maps; +} + +static inline void thread__set_maps(struct thread *thread, struct maps *maps) +{ + thread->maps = maps; +} + +static inline pid_t thread__pid(const struct thread *thread) +{ + return thread->pid_; +} + +static inline void thread__set_pid(struct thread *thread, pid_t pid_) +{ + thread->pid_ = pid_; +} + +static inline pid_t thread__tid(const struct thread *thread) +{ + return thread->tid; +} + +static inline void thread__set_tid(struct thread *thread, pid_t tid) +{ + thread->tid = tid; +} + +static inline pid_t thread__ppid(const struct thread *thread) +{ + return thread->ppid; +} + +static inline void thread__set_ppid(struct thread *thread, pid_t ppid) +{ + thread->ppid = ppid; +} + +static inline int thread__cpu(const struct thread *thread) +{ + return thread->cpu; +} + +static inline void thread__set_cpu(struct thread *thread, int cpu) +{ + thread->cpu = cpu; +} + +static inline int thread__guest_cpu(const struct thread *thread) +{ + return thread->guest_cpu; +} + +static inline void thread__set_guest_cpu(struct thread *thread, int guest_cpu) +{ + thread->guest_cpu = guest_cpu; +} + +static inline refcount_t *thread__refcnt(struct thread *thread) +{ + return &thread->refcnt; +} + +static inline bool thread__comm_set(const struct thread *thread) +{ + return thread->comm_set; +} + +static inline void thread__set_comm_set(struct thread *thread, bool set) +{ + thread->comm_set = set; +} + +static inline int thread__var_comm_len(const struct thread *thread) +{ + return thread->comm_len; +} + +static inline void thread__set_comm_len(struct thread *thread, int len) +{ + thread->comm_len = len; +} + +static inline struct list_head *thread__namespaces_list(struct thread *thread) +{ + return &thread->namespaces_list; +} + +static inline int thread__namespaces_list_empty(const struct thread *thread) +{ + return list_empty(&thread->namespaces_list); +} + +static inline struct rw_semaphore *thread__namespaces_lock(struct thread *thread) +{ + return &thread->namespaces_lock; +} + +static inline struct list_head *thread__comm_list(struct thread *thread) +{ + return &thread->comm_list; +} + +static inline struct rw_semaphore *thread__comm_lock(struct thread *thread) +{ + return &thread->comm_lock; +} + +static inline u64 thread__db_id(const struct thread *thread) +{ + return thread->db_id; +} + +static inline void thread__set_db_id(struct thread *thread, u64 db_id) +{ + thread->db_id = db_id; +} + static inline void *thread__priv(struct thread *thread) { return thread->priv; @@ -131,6 +251,66 @@ static inline void thread__set_priv(struct thread *thread, void *p) thread->priv = p; } +static inline struct thread_stack *thread__ts(struct thread *thread) +{ + return thread->ts; +} + +static inline void thread__set_ts(struct thread *thread, struct thread_stack *ts) +{ + thread->ts = ts; +} + +static inline struct nsinfo *thread__nsinfo(struct thread *thread) +{ + return thread->nsinfo; +} + +static inline struct srccode_state *thread__srccode_state(struct thread *thread) +{ + return &thread->srccode_state; +} + +static inline bool thread__filter(const struct thread *thread) +{ + return thread->filter; +} + +static inline void thread__set_filter(struct thread *thread, bool filter) +{ + thread->filter = filter; +} + +static inline int thread__filter_entry_depth(const struct thread *thread) +{ + return thread->filter_entry_depth; +} + +static inline void thread__set_filter_entry_depth(struct thread *thread, int depth) +{ + thread->filter_entry_depth = depth; +} + +static inline bool thread__lbr_stitch_enable(const struct thread *thread) +{ + return thread->lbr_stitch_enable; +} + +static inline void thread__set_lbr_stitch_enable(struct thread *thread, bool en) +{ + thread->lbr_stitch_enable = en; +} + +static inline struct lbr_stitch *thread__lbr_stitch(struct thread *thread) +{ + return thread->lbr_stitch; +} + +static inline void thread__set_lbr_stitch(struct thread *thread, struct lbr_stitch *lbrs) +{ + thread->lbr_stitch = lbrs; +} + static inline bool thread__is_filtered(struct thread *thread) { if (symbol_conf.comm_list && @@ -139,12 +319,12 @@ static inline bool thread__is_filtered(struct thread *thread) } if (symbol_conf.pid_list && - !intlist__has_entry(symbol_conf.pid_list, thread->pid_)) { + !intlist__has_entry(symbol_conf.pid_list, thread__pid(thread))) { return true; } if (symbol_conf.tid_list && - !intlist__has_entry(symbol_conf.tid_list, thread->tid)) { + !intlist__has_entry(symbol_conf.tid_list, thread__tid(thread))) { return true; } diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index bdccfc511b7e..3723b5e31b2a 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -230,7 +230,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, struct unwind_info *ui, ui_buf = { .sample = data, .thread = thread, - .machine = RC_CHK_ACCESS(thread->maps)->machine, + .machine = RC_CHK_ACCESS(thread__maps(thread))->machine, .cb = cb, .arg = arg, .max_stack = max_stack, @@ -260,11 +260,11 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, if (err) goto out; - err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread->tid, &callbacks, ui); + err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), &callbacks, ui); if (err) goto out; - err = dwfl_getthread_frames(ui->dwfl, thread->tid, frame_callback, ui); + err = dwfl_getthread_frames(ui->dwfl, thread__tid(thread), frame_callback, ui); if (err && ui->max_stack != max_stack) err = 0; diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 83dd79dcd597..11f3fc95aa11 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -325,7 +325,7 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, return -EINVAL; } - maps__for_each_entry(ui->thread->maps, map_node) { + maps__for_each_entry(thread__maps(ui->thread), map_node) { struct map *map = map_node->map; u64 start = map__start(map); @@ -719,7 +719,7 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, */ if (max_stack - 1 > 0) { WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL"); - addr_space = maps__addr_space(ui->thread->maps); + addr_space = maps__addr_space(thread__maps(ui->thread)); if (addr_space == NULL) return -1; @@ -769,7 +769,7 @@ static int _unwind__get_entries(unwind_entry_cb_t cb, void *arg, struct unwind_info ui = { .sample = data, .thread = thread, - .machine = maps__machine(thread->maps), + .machine = maps__machine(thread__maps(thread)), .best_effort = best_effort }; diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 375d23d9a590..76cd63de80a8 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -89,7 +89,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, struct perf_sample *data, int max_stack, bool best_effort) { - const struct unwind_libunwind_ops *ops = maps__unwind_libunwind_ops(thread->maps); + const struct unwind_libunwind_ops *ops = maps__unwind_libunwind_ops(thread__maps(thread)); if (ops) return ops->get_entries(cb, arg, thread, data, max_stack, best_effort); diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index ec777ee11493..ae3eee69b659 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -146,7 +146,7 @@ static enum dso_type machine__thread_dso_type(struct machine *machine, enum dso_type dso_type = DSO__TYPE_UNKNOWN; struct map_rb_node *rb_node; - maps__for_each_entry(thread->maps, rb_node) { + maps__for_each_entry(thread__maps(thread), rb_node) { struct dso *dso = map__dso(rb_node->map); if (!dso || dso->long_name[0] != '/') -- cgit v1.2.3 From 46125590e0df7602d02602fcb0134a4085aca442 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:01 -0700 Subject: perf maps: Make delete static, always use put Address/leak sanitizer with reference count checking can identify the location of leaks, so use put rather than delete to avoid free-ing memory when the reference count is >1. Add maps__zput to ensure the variable is cleared. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/maps.c | 2 +- tools/perf/util/machine.c | 2 +- tools/perf/util/maps.c | 2 +- tools/perf/util/maps.h | 9 ++++++++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/maps.c b/tools/perf/tests/maps.c index 8c0eb5cf8bb5..5bb1123a91a7 100644 --- a/tools/perf/tests/maps.c +++ b/tools/perf/tests/maps.c @@ -140,7 +140,7 @@ static int test__maps__merge_in(struct test_suite *t __maybe_unused, int subtest ret = check_maps(merged3, ARRAY_SIZE(merged3), maps); TEST_ASSERT_VAL("merge check failed", !ret); - maps__delete(maps); + maps__zput(maps); return TEST_OK; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 5d34d60a0045..8972c852d3bd 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -248,7 +248,7 @@ void machine__exit(struct machine *machine) return; machine__destroy_kernel_maps(machine); - maps__delete(machine->kmaps); + maps__zput(machine->kmaps); dsos__exit(&machine->dsos); machine__exit_vdso(machine); zfree(&machine->root_dir); diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 5ae6379a1b42..5206a6433117 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -171,7 +171,7 @@ struct maps *maps__new(struct machine *machine) return result; } -void maps__delete(struct maps *maps) +static void maps__delete(struct maps *maps) { maps__exit(maps); unwind__finish_access(maps); diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index d2963456cfbe..83144e0645ed 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -57,13 +57,20 @@ struct kmap { }; struct maps *maps__new(struct machine *machine); -void maps__delete(struct maps *maps); bool maps__empty(struct maps *maps); int maps__clone(struct thread *thread, struct maps *parent); struct maps *maps__get(struct maps *maps); void maps__put(struct maps *maps); +static inline void __maps__zput(struct maps **map) +{ + maps__put(*map); + *map = NULL; +} + +#define maps__zput(map) __maps__zput(&map) + static inline struct rb_root *maps__entries(struct maps *maps) { return &RC_CHK_ACCESS(maps)->entries; -- cgit v1.2.3 From 620be847f459fce62f673311d035cd298581b1eb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:02 -0700 Subject: perf addr_location: Move to its own header addr_location is a common abstraction, move it into its own header and source file in preparation for wider clean up. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/addr_location.c | 16 ++++++++++++++++ tools/perf/util/addr_location.h | 28 ++++++++++++++++++++++++++++ tools/perf/util/event.c | 12 ------------ tools/perf/util/symbol.h | 17 +---------------- 5 files changed, 46 insertions(+), 28 deletions(-) create mode 100644 tools/perf/util/addr_location.c create mode 100644 tools/perf/util/addr_location.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index c449741adf30..ff2fd1a36bb8 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -1,4 +1,5 @@ perf-y += arm64-frame-pointer-unwind-support.o +perf-y += addr_location.o perf-y += annotate.o perf-y += block-info.o perf-y += block-range.o diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c new file mode 100644 index 000000000000..c73fc2aa236c --- /dev/null +++ b/tools/perf/util/addr_location.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "addr_location.h" +#include "map.h" +#include "thread.h" + +/* + * The preprocess_sample method will return with reference counts for the + * in it, when done using (and perhaps getting ref counts if needing to + * keep a pointer to one of those entries) it must be paired with + * addr_location__put(), so that the refcounts can be decremented. + */ +void addr_location__put(struct addr_location *al) +{ + map__zput(al->map); + thread__zput(al->thread); +} diff --git a/tools/perf/util/addr_location.h b/tools/perf/util/addr_location.h new file mode 100644 index 000000000000..7dfa7417c0fe --- /dev/null +++ b/tools/perf/util/addr_location.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_ADDR_LOCATION +#define __PERF_ADDR_LOCATION 1 + +#include + +struct thread; +struct maps; +struct map; +struct symbol; + +struct addr_location { + struct thread *thread; + struct maps *maps; + struct map *map; + struct symbol *sym; + const char *srcline; + u64 addr; + char level; + u8 filtered; + u8 cpumode; + s32 cpu; + s32 socket; +}; + +void addr_location__put(struct addr_location *al); + +#endif /* __PERF_ADDR_LOCATION */ diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index e1ce7cb5e421..6ee23145ee7e 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -767,18 +767,6 @@ int machine__resolve(struct machine *machine, struct addr_location *al, return 0; } -/* - * The preprocess_sample method will return with reference counts for the - * in it, when done using (and perhaps getting ref counts if needing to - * keep a pointer to one of those entries) it must be paired with - * addr_location__put(), so that the refcounts can be decremented. - */ -void addr_location__put(struct addr_location *al) -{ - map__zput(al->map); - thread__zput(al->thread); -} - bool is_bts_event(struct perf_event_attr *attr) { return attr->type == PERF_TYPE_HARDWARE && diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 7558735543c2..5ca8665dd2c1 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -9,6 +9,7 @@ #include #include #include +#include "addr_location.h" #include "path.h" #include "symbol_conf.h" #include "spark.h" @@ -120,22 +121,6 @@ struct ref_reloc_sym { u64 unrelocated_addr; }; -struct addr_location { - struct thread *thread; - struct maps *maps; - struct map *map; - struct symbol *sym; - const char *srcline; - u64 addr; - char level; - u8 filtered; - u8 cpumode; - s32 cpu; - s32 socket; -}; - -void addr_location__put(struct addr_location *al); - int dso__load(struct dso *dso, struct map *map); int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, bool vmlinux_allocated); -- cgit v1.2.3 From 0dd5041c9a0eaf8c5c3fd46df4ee60f877799f44 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:03 -0700 Subject: perf addr_location: Add init/exit/copy functions struct addr_location holds references to multiple reference counted objects. Add init/exit functions to make maintenance of those more consistent with the rest of the code and to try to avoid leaks. Modification of thread reference counts isn't included in this change. Committer notes: I needed to initialize result to sample->ip to make sure is set to something, fixing a compile time error, mostly keeping the previous logic as build_alloc_func_list() already does debugging/error prints about what went wrong if it takes the 'goto out'. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 28 +++++--- tools/perf/builtin-c2c.c | 12 ++-- tools/perf/builtin-diff.c | 16 +++-- tools/perf/builtin-inject.c | 2 + tools/perf/builtin-kmem.c | 10 ++- tools/perf/builtin-kwork.c | 15 +++-- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-report.c | 6 +- tools/perf/builtin-sched.c | 2 + tools/perf/builtin-script.c | 77 +++++++++++++--------- tools/perf/builtin-timechart.c | 11 ++-- tools/perf/builtin-top.c | 6 +- tools/perf/builtin-trace.c | 10 ++- tools/perf/tests/code-reading.c | 3 +- tools/perf/tests/hists_cumulate.c | 17 +++-- tools/perf/tests/hists_filter.c | 11 ++-- tools/perf/tests/hists_link.c | 18 +++-- tools/perf/tests/hists_output.c | 10 ++- tools/perf/tests/mmap-thread-lookup.c | 4 +- tools/perf/util/addr_location.c | 30 ++++++++- tools/perf/util/addr_location.h | 5 +- tools/perf/util/build-id.c | 2 + tools/perf/util/cs-etm.c | 20 +++--- tools/perf/util/data-convert-json.c | 8 ++- tools/perf/util/db-export.c | 4 +- tools/perf/util/dlfilter.c | 13 +++- tools/perf/util/event.c | 16 +++-- tools/perf/util/evsel_fprintf.c | 8 ++- tools/perf/util/hist.c | 8 ++- tools/perf/util/intel-pt.c | 66 +++++++++++++------ tools/perf/util/machine.c | 35 +++++----- .../util/scripting-engines/trace-event-python.c | 10 ++- tools/perf/util/thread.c | 13 +++- tools/perf/util/unwind-libdw.c | 21 ++++-- tools/perf/util/unwind-libunwind-local.c | 13 +++- 35 files changed, 368 insertions(+), 166 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 425a7e2fd6fb..aeeb801f1ed7 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -184,7 +184,7 @@ out: static int process_branch_callback(struct evsel *evsel, struct perf_sample *sample, - struct addr_location *al __maybe_unused, + struct addr_location *al, struct perf_annotate *ann, struct machine *machine) { @@ -195,21 +195,29 @@ static int process_branch_callback(struct evsel *evsel, .hide_unresolved = symbol_conf.hide_unresolved, .ops = &hist_iter_branch, }; - struct addr_location a; + int ret; - if (machine__resolve(machine, &a, sample) < 0) - return -1; + addr_location__init(&a); + if (machine__resolve(machine, &a, sample) < 0) { + ret = -1; + goto out; + } - if (a.sym == NULL) - return 0; + if (a.sym == NULL) { + ret = 0; + goto out; + } if (a.map != NULL) map__dso(a.map)->hit = 1; hist__account_cycles(sample->branch_stack, al, sample, false, NULL); - return hist_entry_iter__add(&iter, &a, PERF_MAX_STACK_DEPTH, ann); + ret = hist_entry_iter__add(&iter, &a, PERF_MAX_STACK_DEPTH, ann); +out: + addr_location__exit(&a); + return ret; } static bool has_annotation(struct perf_annotate *ann) @@ -272,10 +280,12 @@ static int process_sample_event(struct perf_tool *tool, struct addr_location al; int ret = 0; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); - return -1; + ret = -1; + goto out_put; } if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap)) @@ -288,7 +298,7 @@ static int process_sample_event(struct perf_tool *tool, ret = -1; } out_put: - addr_location__put(&al); + addr_location__exit(&al); return ret; } diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index ee41a96f0c73..530a44a59f41 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -286,10 +286,12 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, struct mem_info *mi, *mi_dup; int ret; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); - return -1; + ret = -1; + goto out; } if (c2c.stitch_lbr) @@ -301,8 +303,10 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, goto out; mi = sample__resolve_mem(sample, &al); - if (mi == NULL) - return -ENOMEM; + if (mi == NULL) { + ret = -ENOMEM; + goto out; + } /* * The mi object is released in hists__add_entry_ops, @@ -368,7 +372,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, } out: - addr_location__put(&al); + addr_location__exit(&al); return ret; free_mi: diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index dbb0562d6a4f..ca39657ee407 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -409,15 +409,17 @@ static int diff__process_sample_event(struct perf_tool *tool, return 0; } + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); - return -1; + ret = -1; + goto out; } if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) { ret = 0; - goto out_put; + goto out; } switch (compute) { @@ -426,7 +428,7 @@ static int diff__process_sample_event(struct perf_tool *tool, NULL, NULL, NULL, sample, true)) { pr_warning("problem incrementing symbol period, " "skipping event\n"); - goto out_put; + goto out; } hist__account_cycles(sample->branch_stack, &al, sample, false, @@ -437,7 +439,7 @@ static int diff__process_sample_event(struct perf_tool *tool, if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, NULL)) { pr_debug("problem adding hist entry, skipping event\n"); - goto out_put; + goto out; } break; @@ -446,7 +448,7 @@ static int diff__process_sample_event(struct perf_tool *tool, true)) { pr_warning("problem incrementing symbol period, " "skipping event\n"); - goto out_put; + goto out; } } @@ -460,8 +462,8 @@ static int diff__process_sample_event(struct perf_tool *tool, if (!al.filtered) hists->stats.total_non_filtered_period += sample->period; ret = 0; -out_put: - addr_location__put(&al); +out: + addr_location__exit(&al); return ret; } diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d9e96d4624c6..d19a1b862306 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -743,6 +743,7 @@ int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, struct addr_location al; struct thread *thread; + addr_location__init(&al); thread = machine__findnew_thread(machine, sample->pid, sample->tid); if (thread == NULL) { pr_err("problem processing %d event, skipping it.\n", @@ -763,6 +764,7 @@ int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, thread__put(thread); repipe: perf_event__repipe(tool, event, sample, machine); + addr_location__exit(&al); return 0; } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index fe9439a4fd66..96a6611e4e53 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -399,7 +399,9 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample) struct addr_location al; struct machine *machine = &kmem_session->machines.host; struct callchain_cursor_node *node; + u64 result = sample->ip; + addr_location__init(&al); if (alloc_func_list == NULL) { if (build_alloc_func_list() < 0) goto out; @@ -427,16 +429,18 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample) else addr = node->ip; - return addr; + result = addr; + goto out; } else pr_debug3("skipping alloc function: %s\n", caller->name); callchain_cursor_advance(&callchain_cursor); } -out: pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip); - return sample->ip; +out: + addr_location__exit(&al); + return result; } struct sort_dimension { diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index a9395c52b23b..2d80aef4eccc 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -739,17 +739,22 @@ static int timehist_exit_event(struct perf_kwork *kwork, struct kwork_atom *atom = NULL; struct kwork_work *work = NULL; struct addr_location al; + int ret = 0; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_debug("Problem processing event, skipping it\n"); - return -1; + ret = -1; + goto out; } atom = work_pop_atom(kwork, class, KWORK_TRACE_EXIT, KWORK_TRACE_ENTRY, evsel, sample, machine, &work); - if (work == NULL) - return -1; + if (work == NULL) { + ret = -1; + goto out; + } if (atom != NULL) { work->nr_atoms++; @@ -757,7 +762,9 @@ static int timehist_exit_event(struct perf_kwork *kwork, atom_del(atom); } - return 0; +out: + addr_location__exit(&al); + return ret; } static struct kwork_class kwork_irq; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 960bfd4b732a..51499c20da01 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -199,9 +199,11 @@ dump_raw_samples(struct perf_tool *tool, char str[PAGE_SIZE_NAME_LEN]; struct dso *dso = NULL; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); + addr_location__exit(&al); return -1; } @@ -256,7 +258,7 @@ dump_raw_samples(struct perf_tool *tool, dso ? dso->long_name : "???", al.sym ? al.sym->name : "???"); out_put: - addr_location__put(&al); + addr_location__exit(&al); return 0; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8ea6ab18534a..0b091a8983a5 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -285,10 +285,12 @@ static int process_sample_event(struct perf_tool *tool, if (evswitch__discard(&rep->evswitch, evsel)) return 0; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); - return -1; + ret = -1; + goto out_put; } if (rep->stitch_lbr) @@ -331,7 +333,7 @@ static int process_sample_event(struct perf_tool *tool, if (ret < 0) pr_debug("problem adding hist entry, skipping event\n"); out_put: - addr_location__put(&al); + addr_location__exit(&al); return ret; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index fd37468c4f62..c75ad82a6729 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2584,6 +2584,7 @@ static int timehist_sched_change_event(struct perf_tool *tool, int rc = 0; int state = evsel__intval(evsel, sample, "prev_state"); + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_err("problem processing %d event. skipping it\n", event->header.type); @@ -2692,6 +2693,7 @@ out: evsel__save_time(evsel, sample->time, sample->cpu); + addr_location__exit(&al); return rc; } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e756290de2ac..784d478c2e05 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -919,7 +919,6 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, { struct branch_stack *br = sample->branch_stack; struct branch_entry *entries = perf_sample__branch_entries(sample); - struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -930,20 +929,22 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, from = entries[i].from; to = entries[i].to; + printed += fprintf(fp, " 0x%"PRIx64, from); if (PRINT_FIELD(DSO)) { - memset(&alf, 0, sizeof(alf)); - memset(&alt, 0, sizeof(alt)); + struct addr_location alf, alt; + + addr_location__init(&alf); + addr_location__init(&alt); thread__find_map_fb(thread, sample->cpumode, from, &alf); thread__find_map_fb(thread, sample->cpumode, to, &alt); - } - printed += fprintf(fp, " 0x%"PRIx64, from); - if (PRINT_FIELD(DSO)) printed += map__fprintf_dsoname_dsoff(alf.map, PRINT_FIELD(DSOFF), alf.addr, fp); - - printed += fprintf(fp, "/0x%"PRIx64, to); - if (PRINT_FIELD(DSO)) + printed += fprintf(fp, "/0x%"PRIx64, to); printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); + addr_location__exit(&alt); + addr_location__exit(&alf); + } else + printed += fprintf(fp, "/0x%"PRIx64, to); printed += print_bstack_flags(fp, entries + i); } @@ -957,7 +958,6 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, { struct branch_stack *br = sample->branch_stack; struct branch_entry *entries = perf_sample__branch_entries(sample); - struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -965,9 +965,10 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, return 0; for (i = 0; i < br->nr; i++) { + struct addr_location alf, alt; - memset(&alf, 0, sizeof(alf)); - memset(&alt, 0, sizeof(alt)); + addr_location__init(&alf); + addr_location__init(&alt); from = entries[i].from; to = entries[i].to; @@ -982,6 +983,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, if (PRINT_FIELD(DSO)) printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); printed += print_bstack_flags(fp, entries + i); + addr_location__exit(&alt); + addr_location__exit(&alf); } return printed; @@ -993,7 +996,6 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, { struct branch_stack *br = sample->branch_stack; struct branch_entry *entries = perf_sample__branch_entries(sample); - struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -1001,9 +1003,10 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, return 0; for (i = 0; i < br->nr; i++) { + struct addr_location alf, alt; - memset(&alf, 0, sizeof(alf)); - memset(&alt, 0, sizeof(alt)); + addr_location__init(&alf); + addr_location__init(&alt); from = entries[i].from; to = entries[i].to; @@ -1022,6 +1025,8 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, if (PRINT_FIELD(DSO)) printed += map__fprintf_dsoname_dsoff(alt.map, PRINT_FIELD(DSOFF), alt.addr, fp); printed += print_bstack_flags(fp, entries + i); + addr_location__exit(&alt); + addr_location__exit(&alf); } return printed; @@ -1036,6 +1041,7 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, struct addr_location al; bool kernel; struct dso *dso; + int ret = 0; if (!start || !end) return 0; @@ -1057,7 +1063,6 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, return -ENXIO; } - memset(&al, 0, sizeof(al)); if (end - start > MAXBB - MAXINSN) { if (last) pr_debug("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end); @@ -1066,13 +1071,14 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, return 0; } + addr_location__init(&al); if (!thread__find_map(thread, *cpumode, start, &al) || (dso = map__dso(al.map)) == NULL) { pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); - return 0; + goto out; } if (dso->data.status == DSO_DATA_STATUS_ERROR) { pr_debug("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end); - return 0; + goto out; } /* Load maps to ensure dso->is_64_bit has been updated */ @@ -1086,7 +1092,10 @@ static int grab_bb(u8 *buffer, u64 start, u64 end, if (len <= 0) pr_debug("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n", start, end); - return len; + ret = len; +out: + addr_location__exit(&al); + return ret; } static int map__fprintf_srccode(struct map *map, u64 addr, FILE *fp, struct srccode_state *state) @@ -1137,14 +1146,16 @@ static int print_srccode(struct thread *thread, u8 cpumode, uint64_t addr) struct addr_location al; int ret = 0; - memset(&al, 0, sizeof(al)); + addr_location__init(&al); thread__find_map(thread, cpumode, addr, &al); if (!al.map) - return 0; + goto out; ret = map__fprintf_srccode(al.map, al.addr, stdout, thread__srccode_state(thread)); if (ret) ret += printf("\n"); +out: + addr_location__exit(&al); return ret; } @@ -1179,14 +1190,13 @@ static int ip__fprintf_sym(uint64_t addr, struct thread *thread, struct perf_event_attr *attr, FILE *fp) { struct addr_location al; - int off, printed = 0; - - memset(&al, 0, sizeof(al)); + int off, printed = 0, ret = 0; + addr_location__init(&al); thread__find_map(thread, cpumode, addr, &al); if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end) - return 0; + goto out; al.cpu = cpu; al.sym = NULL; @@ -1194,7 +1204,7 @@ static int ip__fprintf_sym(uint64_t addr, struct thread *thread, al.sym = map__find_symbol(al.map, al.addr); if (!al.sym) - return 0; + goto out; if (al.addr < al.sym->end) off = al.addr - al.sym->start; @@ -1209,7 +1219,10 @@ static int ip__fprintf_sym(uint64_t addr, struct thread *thread, printed += fprintf(fp, "\n"); *lastsym = al.sym; - return printed; + ret = printed; +out: + addr_location__exit(&al); + return ret; } static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, @@ -1371,6 +1384,7 @@ static int perf_sample__fprintf_addr(struct perf_sample *sample, struct addr_location al; int printed = fprintf(fp, "%16" PRIx64, sample->addr); + addr_location__init(&al); if (!sample_addr_correlates_sym(attr)) goto out; @@ -1387,6 +1401,7 @@ static int perf_sample__fprintf_addr(struct perf_sample *sample, if (PRINT_FIELD(DSO)) printed += map__fprintf_dsoname_dsoff(al.map, PRINT_FIELD(DSOFF), al.addr, fp); out: + addr_location__exit(&al); return printed; } @@ -2338,8 +2353,8 @@ static int process_sample_event(struct perf_tool *tool, int ret = 0; /* Set thread to NULL to indicate addr_al and al are not initialized */ - addr_al.thread = NULL; - al.thread = NULL; + addr_location__init(&al); + addr_location__init(&addr_al); ret = dlfilter__filter_event_early(dlfilter, event, sample, evsel, machine, &al, &addr_al); if (ret) { @@ -2405,8 +2420,8 @@ static int process_sample_event(struct perf_tool *tool, } out_put: - if (al.thread) - addr_location__put(&al); + addr_location__exit(&addr_al); + addr_location__exit(&al); return ret; } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 829d99fecfd0..19d4542ea18a 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -498,7 +498,6 @@ static const char *cat_backtrace(union perf_event *event, char *p = NULL; size_t p_len; u8 cpumode = PERF_RECORD_MISC_USER; - struct addr_location tal; struct ip_callchain *chain = sample->callchain; FILE *f = open_memstream(&p, &p_len); @@ -507,6 +506,7 @@ static const char *cat_backtrace(union perf_event *event, return NULL; } + addr_location__init(&al); if (!chain) goto exit; @@ -518,6 +518,7 @@ static const char *cat_backtrace(union perf_event *event, for (i = 0; i < chain->nr; i++) { u64 ip; + struct addr_location tal; if (callchain_param.order == ORDER_CALLEE) ip = chain->ips[i]; @@ -544,20 +545,22 @@ static const char *cat_backtrace(union perf_event *event, * Discard all. */ zfree(&p); - goto exit_put; + goto exit; } continue; } + addr_location__init(&tal); tal.filtered = 0; if (thread__find_symbol(al.thread, cpumode, ip, &tal)) fprintf(f, "..... %016" PRIx64 " %s\n", ip, tal.sym->name); else fprintf(f, "..... %016" PRIx64 "\n", ip); + + addr_location__exit(&tal); } -exit_put: - addr_location__put(&al); exit: + addr_location__exit(&al); fclose(f); return p; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 9d3cbebb9b79..99010dfa5760 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -773,8 +773,9 @@ static void perf_event__process_sample(struct perf_tool *tool, if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) top->exact_samples++; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) - return; + goto out; if (top->stitch_lbr) thread__set_lbr_stitch_enable(al.thread, true); @@ -848,7 +849,8 @@ static void perf_event__process_sample(struct perf_tool *tool, mutex_unlock(&hists->lock); } - addr_location__put(&al); +out: + addr_location__exit(&al); } static void diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 4c9bec39423b..6a1e75f06832 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2418,13 +2418,15 @@ static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel, int max_stack = evsel->core.attr.sample_max_stack ? evsel->core.attr.sample_max_stack : trace->max_stack; - int err; + int err = -1; + addr_location__init(&al); if (machine__resolve(trace->host, &al, sample) < 0) - return -1; + goto out; err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); - addr_location__put(&al); +out: + addr_location__exit(&al); return err; } @@ -2893,6 +2895,7 @@ static int trace__pgfault(struct trace *trace, int err = -1; int callchain_ret = 0; + addr_location__init(&al); thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { @@ -2953,6 +2956,7 @@ out: err = 0; out_put: thread__put(thread); + addr_location__exit(&al); return err; } diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 9d8eefbebd48..2a7b2b6f5286 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -241,6 +241,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, pr_debug("Reading object code for memory address: %#"PRIx64"\n", addr); + addr_location__init(&al); if (!thread__find_map(thread, cpumode, addr, &al) || !map__dso(al.map)) { if (cpumode == PERF_RECORD_MISC_HYPERVISOR) { pr_debug("Hypervisor address can not be resolved - skipping\n"); @@ -366,7 +367,7 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, } pr_debug("Bytes read match those read by objdump\n"); out: - map__put(al.map); + addr_location__exit(&al); return err; } diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 62b9c6461ea6..71dacb0fec4d 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -8,8 +8,8 @@ #include "util/evsel.h" #include "util/evlist.h" #include "util/machine.h" -#include "util/thread.h" #include "util/parse-events.h" +#include "util/thread.h" #include "tests/tests.h" #include "tests/hists_common.h" #include @@ -84,6 +84,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) struct perf_sample sample = { .period = 1000, }; size_t i; + addr_location__init(&al); for (i = 0; i < ARRAY_SIZE(fake_samples); i++) { struct hist_entry_iter iter = { .evsel = evsel, @@ -107,20 +108,22 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { - addr_location__put(&al); goto out; } - fake_samples[i].thread = al.thread; + thread__put(fake_samples[i].thread); + fake_samples[i].thread = thread__get(al.thread); map__put(fake_samples[i].map); - fake_samples[i].map = al.map; + fake_samples[i].map = map__get(al.map); fake_samples[i].sym = al.sym; } + addr_location__exit(&al); return TEST_OK; out: pr_debug("Not enough memory for adding a hist entry\n"); + addr_location__exit(&al); return TEST_FAIL; } @@ -152,8 +155,10 @@ static void put_fake_samples(void) { size_t i; - for (i = 0; i < ARRAY_SIZE(fake_samples); i++) - map__put(fake_samples[i].map); + for (i = 0; i < ARRAY_SIZE(fake_samples); i++) { + map__zput(fake_samples[i].map); + thread__zput(fake_samples[i].thread); + } } typedef int (*test_fn_t)(struct evsel *, struct machine *); diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index 98eff5935a1c..4b2e4f2fbe48 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -8,6 +8,7 @@ #include "util/evlist.h" #include "util/machine.h" #include "util/parse-events.h" +#include "util/thread.h" #include "tests/tests.h" #include "tests/hists_common.h" #include @@ -53,6 +54,7 @@ static int add_hist_entries(struct evlist *evlist, struct perf_sample sample = { .period = 100, }; size_t i; + addr_location__init(&al); /* * each evsel will have 10 samples but the 4th sample * (perf [perf] main) will be collapsed to an existing entry @@ -84,21 +86,22 @@ static int add_hist_entries(struct evlist *evlist, al.socket = fake_samples[i].socket; if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { - addr_location__put(&al); goto out; } - fake_samples[i].thread = al.thread; + thread__put(fake_samples[i].thread); + fake_samples[i].thread = thread__get(al.thread); map__put(fake_samples[i].map); - fake_samples[i].map = al.map; + fake_samples[i].map = map__get(al.map); fake_samples[i].sym = al.sym; } } - + addr_location__exit(&al); return 0; out: pr_debug("Not enough memory for adding a hist entry\n"); + addr_location__exit(&al); return TEST_FAIL; } diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 141e2972e34f..12bad8840699 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -8,6 +8,7 @@ #include "machine.h" #include "map.h" #include "parse-events.h" +#include "thread.h" #include "hists_common.h" #include "util/mmap.h" #include @@ -70,6 +71,7 @@ static int add_hist_entries(struct evlist *evlist, struct machine *machine) struct perf_sample sample = { .period = 1, .weight = 1, }; size_t i = 0, k; + addr_location__init(&al); /* * each evsel will have 10 samples - 5 common and 5 distinct. * However the second evsel also has a collapsed entry for @@ -90,13 +92,13 @@ static int add_hist_entries(struct evlist *evlist, struct machine *machine) he = hists__add_entry(hists, &al, NULL, NULL, NULL, NULL, &sample, true); if (he == NULL) { - addr_location__put(&al); goto out; } - fake_common_samples[k].thread = al.thread; + thread__put(fake_common_samples[k].thread); + fake_common_samples[k].thread = thread__get(al.thread); map__put(fake_common_samples[k].map); - fake_common_samples[k].map = al.map; + fake_common_samples[k].map = map__get(al.map); fake_common_samples[k].sym = al.sym; } @@ -110,20 +112,22 @@ static int add_hist_entries(struct evlist *evlist, struct machine *machine) he = hists__add_entry(hists, &al, NULL, NULL, NULL, NULL, &sample, true); if (he == NULL) { - addr_location__put(&al); goto out; } - fake_samples[i][k].thread = al.thread; - fake_samples[i][k].map = al.map; + thread__put(fake_samples[i][k].thread); + fake_samples[i][k].thread = thread__get(al.thread); + map__put(fake_samples[i][k].map); + fake_samples[i][k].map = map__get(al.map); fake_samples[i][k].sym = al.sym; } i++; } + addr_location__exit(&al); return 0; - out: + addr_location__exit(&al); pr_debug("Not enough memory for adding a hist entry\n"); return -1; } diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index cd2094c13e1e..ba1cccf57049 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -54,6 +54,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) struct perf_sample sample = { .period = 100, }; size_t i; + addr_location__init(&al); for (i = 0; i < ARRAY_SIZE(fake_samples); i++) { struct hist_entry_iter iter = { .evsel = evsel, @@ -73,20 +74,21 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { - addr_location__put(&al); goto out; } fake_samples[i].thread = al.thread; map__put(fake_samples[i].map); - fake_samples[i].map = al.map; + fake_samples[i].map = map__get(al.map); fake_samples[i].sym = al.sym; } + addr_location__exit(&al); return TEST_OK; out: pr_debug("Not enough memory for adding a hist entry\n"); + addr_location__exit(&al); return TEST_FAIL; } @@ -118,8 +120,10 @@ static void put_fake_samples(void) { size_t i; - for (i = 0; i < ARRAY_SIZE(fake_samples); i++) + for (i = 0; i < ARRAY_SIZE(fake_samples); i++) { map__put(fake_samples[i].map); + fake_samples[i].map = NULL; + } } typedef int (*test_fn_t)(struct evsel *, struct machine *); diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 898eda55b7a8..3891a2a3b46f 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -187,6 +187,7 @@ static int mmap_events(synth_cb synth) struct addr_location al; struct thread *thread; + addr_location__init(&al); thread = machine__findnew_thread(machine, getpid(), td->tid); pr_debug("looking for map %p\n", td->map); @@ -199,11 +200,12 @@ static int mmap_events(synth_cb synth) if (!al.map) { pr_debug("failed, couldn't find map\n"); err = -1; + addr_location__exit(&al); break; } pr_debug("map %p, addr %" PRIx64 "\n", al.map, map__start(al.map)); - map__put(al.map); + addr_location__exit(&al); } machine__delete_threads(machine); diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c index c73fc2aa236c..51825ef8c0ab 100644 --- a/tools/perf/util/addr_location.c +++ b/tools/perf/util/addr_location.c @@ -1,16 +1,44 @@ // SPDX-License-Identifier: GPL-2.0 #include "addr_location.h" #include "map.h" +#include "maps.h" #include "thread.h" +void addr_location__init(struct addr_location *al) +{ + al->thread = NULL; + al->maps = NULL; + al->map = NULL; + al->sym = NULL; + al->srcline = NULL; + al->addr = 0; + al->level = 0; + al->filtered = 0; + al->cpumode = 0; + al->cpu = 0; + al->socket = 0; +} + /* * The preprocess_sample method will return with reference counts for the * in it, when done using (and perhaps getting ref counts if needing to * keep a pointer to one of those entries) it must be paired with * addr_location__put(), so that the refcounts can be decremented. */ -void addr_location__put(struct addr_location *al) +void addr_location__exit(struct addr_location *al) { map__zput(al->map); thread__zput(al->thread); + maps__zput(al->maps); +} + +void addr_location__copy(struct addr_location *dst, struct addr_location *src) +{ + thread__put(dst->thread); + maps__put(dst->maps); + map__put(dst->map); + *dst = *src; + dst->thread = thread__get(src->thread); + dst->maps = maps__get(src->maps); + dst->map = map__get(src->map); } diff --git a/tools/perf/util/addr_location.h b/tools/perf/util/addr_location.h index 7dfa7417c0fe..d8ac0428dff2 100644 --- a/tools/perf/util/addr_location.h +++ b/tools/perf/util/addr_location.h @@ -23,6 +23,9 @@ struct addr_location { s32 socket; }; -void addr_location__put(struct addr_location *al); +void addr_location__init(struct addr_location *al); +void addr_location__exit(struct addr_location *al); + +void addr_location__copy(struct addr_location *dst, struct addr_location *src); #endif /* __PERF_ADDR_LOCATION */ diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 06a8cd88cbef..36728222a5b4 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -58,9 +58,11 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, return -1; } + addr_location__init(&al); if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) map__dso(al.map)->hit = 1; + addr_location__exit(&al); thread__put(thread); return 0; } diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index b550c7393155..416f2ddc3895 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -910,33 +910,35 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, struct addr_location al; struct dso *dso; struct cs_etm_traceid_queue *tidq; + int ret = 0; if (!etmq) return 0; + addr_location__init(&al); machine = etmq->etm->machine; cpumode = cs_etm__cpu_mode(etmq, address); tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id); if (!tidq) - return 0; + goto out; thread = tidq->thread; if (!thread) { if (cpumode != PERF_RECORD_MISC_KERNEL) - return 0; + goto out; thread = etmq->etm->unknown_thread; } if (!thread__find_map(thread, cpumode, address, &al)) - return 0; + goto out; dso = map__dso(al.map); if (!dso) - return 0; + goto out; if (dso->data.status == DSO_DATA_STATUS_ERROR && dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) - return 0; + goto out; offset = map__map_ip(al.map, address); @@ -953,10 +955,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, dso->long_name ? dso->long_name : "Unknown"); dso->auxtrace_warned = true; } - return 0; + goto out; } - - return len; + ret = len; +out: + addr_location__exit(&al); + return ret; } static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 291591e303cd..5bb3c2ba95ca 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -154,12 +154,14 @@ static int process_sample_event(struct perf_tool *tool, { struct convert_json *c = container_of(tool, struct convert_json, tool); FILE *out = c->out; - struct addr_location al, tal; + struct addr_location al; u64 sample_type = __evlist__combined_sample_type(evsel->evlist); u8 cpumode = PERF_RECORD_MISC_USER; + addr_location__init(&al); if (machine__resolve(machine, &al, sample) < 0) { pr_err("Sample resolution failed!\n"); + addr_location__exit(&al); return -1; } @@ -190,6 +192,7 @@ static int process_sample_event(struct perf_tool *tool, for (i = 0; i < sample->callchain->nr; ++i) { u64 ip = sample->callchain->ips[i]; + struct addr_location tal; if (ip >= PERF_CONTEXT_MAX) { switch (ip) { @@ -215,8 +218,10 @@ static int process_sample_event(struct perf_tool *tool, else fputc(',', out); + addr_location__init(&tal); ok = thread__find_symbol(al.thread, cpumode, ip, &tal); output_sample_callchain_entry(tool, ip, ok ? &tal : NULL); + addr_location__exit(&tal); } } else { output_sample_callchain_entry(tool, sample->ip, &al); @@ -245,6 +250,7 @@ static int process_sample_event(struct perf_tool *tool, } #endif output_json_format(out, false, 2, "}"); + addr_location__exit(&al); return 0; } diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 751fd53bfd93..6184696dc266 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -239,16 +239,17 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, struct addr_location al; u64 dso_db_id = 0, sym_db_id = 0, offset = 0; - memset(&al, 0, sizeof(al)); node = callchain_cursor_current(&callchain_cursor); if (!node) break; + /* * Handle export of symbol and dso for this node by * constructing an addr_location struct and then passing it to * db_ids_from_al() to perform the export. */ + addr_location__init(&al); al.sym = node->ms.sym; al.map = node->ms.map; al.maps = thread__maps(thread); @@ -265,6 +266,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, kernel_start); callchain_cursor_advance(&callchain_cursor); + addr_location__exit(&al); } /* Reset the callchain order to its prior value. */ diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c index 8016f21dc0b8..46f74b2344db 100644 --- a/tools/perf/util/dlfilter.c +++ b/tools/perf/util/dlfilter.c @@ -258,6 +258,7 @@ static __s32 dlfilter__object_code(void *ctx, __u64 ip, void *buf, __u32 len) struct addr_location a; struct map *map; u64 offset; + __s32 ret; if (!d->ctx_valid) return -1; @@ -272,16 +273,22 @@ static __s32 dlfilter__object_code(void *ctx, __u64 ip, void *buf, __u32 len) machine__kernel_ip(d->machine, ip) == machine__kernel_ip(d->machine, d->sample->ip)) goto have_map; + addr_location__init(&a); thread__find_map_fb(al->thread, d->sample->cpumode, ip, &a); - if (!a.map) - return -1; + if (!a.map) { + ret = -1; + goto out; + } map = a.map; have_map: offset = map__map_ip(map, ip); if (ip + len >= map__end(map)) len = map__end(map) - ip; - return dso__data_read_offset(map__dso(map), d->machine, offset, buf, len); + ret = dso__data_read_offset(map__dso(map), d->machine, offset, buf, len); +out: + addr_location__exit(&a); + return ret; } static const struct perf_dlfilter_fns perf_dlfilter_fns = { diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 6ee23145ee7e..2fcfba38fc48 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -486,6 +486,7 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma if (machine) { struct addr_location al; + addr_location__init(&al); al.map = map__get(maps__find(machine__kernel_maps(machine), tp->addr)); if (al.map && map__load(al.map) >= 0) { al.addr = map__map_ip(al.map, tp->addr); @@ -493,7 +494,7 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma if (al.sym) ret += symbol__fprintf_symname_offs(al.sym, &al, fp); } - map__put(al.map); + addr_location__exit(&al); } ret += fprintf(fp, " old len %u new len %u\n", tp->old_len, tp->new_len); old = true; @@ -577,8 +578,10 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, struct machine *machine = maps__machine(maps); bool load_map = false; - al->maps = maps; - al->thread = thread; + maps__zput(al->maps); + map__zput(al->map); + thread__zput(al->thread); + al->addr = addr; al->cpumode = cpumode; al->filtered = 0; @@ -590,13 +593,13 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) { al->level = 'k'; - al->maps = maps = machine__kernel_maps(machine); + maps = machine__kernel_maps(machine); load_map = true; } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) { al->level = '.'; } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) { al->level = 'g'; - al->maps = maps = machine__kernel_maps(machine); + maps = machine__kernel_maps(machine); load_map = true; } else if (cpumode == PERF_RECORD_MISC_GUEST_USER && perf_guest) { al->level = 'u'; @@ -615,7 +618,8 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, return NULL; } - + al->maps = maps__get(maps); + al->thread = thread__get(thread); al->map = map__get(maps__find(maps, al->addr)); if (al->map != NULL) { /* diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index a1655fd7ed9b..cf45ca0e768f 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -128,8 +128,6 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, bool first = true; if (sample->callchain) { - struct addr_location node_al; - callchain_cursor_commit(cursor); while (1) { @@ -159,9 +157,12 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); if (print_sym) { + struct addr_location node_al; + + addr_location__init(&node_al); printed += fprintf(fp, " "); node_al.addr = addr; - node_al.map = map; + node_al.map = map__get(map); if (print_symoffset) { printed += __symbol__fprintf_symname_offs(sym, &node_al, @@ -171,6 +172,7 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, printed += __symbol__fprintf_symname(sym, &node_al, print_unknown_as_addr, fp); } + addr_location__exit(&node_al); } if (print_dso && (!sym || !sym->inlined)) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 4bc3affbe891..a4c1b617f6e4 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -588,7 +588,7 @@ static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period) static struct hist_entry *hists__findnew_entry(struct hists *hists, struct hist_entry *entry, - struct addr_location *al, + const struct addr_location *al, bool sample_self) { struct rb_node **p; @@ -927,8 +927,10 @@ iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al) if (iter->curr >= iter->total) return 0; - al->maps = bi[i].to.ms.maps; - al->map = bi[i].to.ms.map; + maps__put(al->maps); + al->maps = maps__get(bi[i].to.ms.maps); + map__put(al->map); + al->map = map__get(bi[i].to.ms.map); al->sym = bi[i].to.ms.sym; al->addr = bi[i].to.addr; return 1; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 45c7e7722916..783ce61c6d25 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -754,13 +754,15 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, struct addr_location al; unsigned char buf[INTEL_PT_INSN_BUF_SZ]; ssize_t len; - int x86_64; + int x86_64, ret = 0; u8 cpumode; u64 offset, start_offset, start_ip; u64 insn_cnt = 0; bool one_map = true; bool nr; + + addr_location__init(&al); intel_pt_insn->length = 0; if (to_ip && *ip == to_ip) @@ -773,19 +775,22 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, if (ptq->pt->have_guest_sideband) { if (!ptq->guest_machine || ptq->guest_machine_pid != ptq->pid) { intel_pt_log("ERROR: guest sideband but no guest machine\n"); - return -EINVAL; + ret = -EINVAL; + goto out_ret; } } else if ((!symbol_conf.guest_code && cpumode != PERF_RECORD_MISC_GUEST_KERNEL) || intel_pt_get_guest(ptq)) { intel_pt_log("ERROR: no guest machine\n"); - return -EINVAL; + ret = -EINVAL; + goto out_ret; } machine = ptq->guest_machine; thread = ptq->guest_thread; if (!thread) { if (cpumode != PERF_RECORD_MISC_GUEST_KERNEL) { intel_pt_log("ERROR: no guest thread\n"); - return -EINVAL; + ret = -EINVAL; + goto out_ret; } thread = ptq->unknown_guest_thread; } @@ -794,7 +799,8 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, if (!thread) { if (cpumode != PERF_RECORD_MISC_KERNEL) { intel_pt_log("ERROR: no thread\n"); - return -EINVAL; + ret = -EINVAL; + goto out_ret; } thread = ptq->pt->unknown_thread; } @@ -808,13 +814,17 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, intel_pt_log("ERROR: thread has no dso for %#" PRIx64 "\n", *ip); else intel_pt_log("ERROR: thread has no map for %#" PRIx64 "\n", *ip); - return -EINVAL; + addr_location__exit(&al); + ret = -EINVAL; + goto out_ret; } dso = map__dso(al.map); if (dso->data.status == DSO_DATA_STATUS_ERROR && - dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) - return -ENOENT; + dso__data_status_seen(dso, DSO_DATA_STATUS_SEEN_ITRACE)) { + ret = -ENOENT; + goto out_ret; + } offset = map__map_ip(al.map, *ip); @@ -833,7 +843,8 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, intel_pt_insn->rel = e->rel; memcpy(intel_pt_insn->buf, e->insn, INTEL_PT_INSN_BUF_SZ); intel_pt_log_insn_no_data(intel_pt_insn, *ip); - return 0; + ret = 0; + goto out_ret; } } @@ -854,11 +865,14 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, offset); if (intel_pt_enable_logging) dso__fprintf(dso, intel_pt_log_fp()); - return -EINVAL; + ret = -EINVAL; + goto out_ret; } - if (intel_pt_get_insn(buf, len, x86_64, intel_pt_insn)) - return -EINVAL; + if (intel_pt_get_insn(buf, len, x86_64, intel_pt_insn)) { + ret = -EINVAL; + goto out_ret; + } intel_pt_log_insn(intel_pt_insn, *ip); @@ -909,17 +923,20 @@ out: e = intel_pt_cache_lookup(map__dso(al.map), machine, start_offset); if (e) - return 0; + goto out_ret; } /* Ignore cache errors */ intel_pt_cache_add(map__dso(al.map), machine, start_offset, insn_cnt, *ip - start_ip, intel_pt_insn); - return 0; +out_ret: + addr_location__exit(&al); + return ret; out_no_cache: *insn_cnt_ptr = insn_cnt; + addr_location__exit(&al); return 0; } @@ -968,6 +985,7 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data) struct addr_location al; u8 cpumode; u64 offset; + int res; if (ptq->state->to_nr) { if (intel_pt_guest_kernel_ip(ip)) @@ -984,12 +1002,15 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data) if (!thread) return -EINVAL; + addr_location__init(&al); if (!thread__find_map(thread, cpumode, ip, &al) || !map__dso(al.map)) return -EINVAL; offset = map__map_ip(al.map, ip); - return intel_pt_match_pgd_ip(ptq->pt, ip, offset, map__dso(al.map)->long_name); + res = intel_pt_match_pgd_ip(ptq->pt, ip, offset, map__dso(al.map)->long_name); + addr_location__exit(&al); + return res; } static bool intel_pt_pgd_ip(uint64_t ip, void *data) @@ -3372,20 +3393,22 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) /* Assume text poke begins in a basic block no more than 4096 bytes */ int cnt = 4096 + event->text_poke.new_len; struct thread *thread = pt->unknown_thread; - struct addr_location al = { .map = NULL }; + struct addr_location al; struct machine *machine = pt->machine; struct intel_pt_cache_entry *e; u64 offset; + int ret = 0; + addr_location__init(&al); if (!event->text_poke.new_len) - return 0; + goto out; for (; cnt; cnt--, addr--) { struct dso *dso; if (intel_pt_find_map(thread, cpumode, addr, &al)) { if (addr < event->text_poke.addr) - return 0; + goto out; continue; } @@ -3406,15 +3429,16 @@ static int intel_pt_text_poke(struct intel_pt *pt, union perf_event *event) * branch instruction before the text poke address. */ if (e->branch != INTEL_PT_BR_NO_BRANCH) - return 0; + goto out; } else { intel_pt_cache_invalidate(dso, machine, offset); intel_pt_log("Invalidated instruction cache for %s at %#"PRIx64"\n", dso->long_name, addr); } } - - return 0; +out: + addr_location__exit(&al); + return ret; } static int intel_pt_process_event(struct perf_session *session, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 8972c852d3bd..9fcf357a4d53 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2221,7 +2221,7 @@ static void ip__resolve_ams(struct thread *thread, { struct addr_location al; - memset(&al, 0, sizeof(al)); + addr_location__init(&al); /* * We cannot use the header.misc hint to determine whether a * branch stack address is user, kernel, guest, hypervisor. @@ -2234,11 +2234,12 @@ static void ip__resolve_ams(struct thread *thread, ams->addr = ip; ams->al_addr = al.addr; ams->al_level = al.level; - ams->ms.maps = al.maps; + ams->ms.maps = maps__get(al.maps); ams->ms.sym = al.sym; - ams->ms.map = al.map; + ams->ms.map = map__get(al.map); ams->phys_addr = 0; ams->data_page_size = 0; + addr_location__exit(&al); } static void ip__resolve_data(struct thread *thread, @@ -2247,18 +2248,19 @@ static void ip__resolve_data(struct thread *thread, { struct addr_location al; - memset(&al, 0, sizeof(al)); + addr_location__init(&al); thread__find_symbol(thread, m, addr, &al); ams->addr = addr; ams->al_addr = al.addr; ams->al_level = al.level; - ams->ms.maps = al.maps; + ams->ms.maps = maps__get(al.maps); ams->ms.sym = al.sym; - ams->ms.map = al.map; + ams->ms.map = map__get(al.map); ams->phys_addr = phys_addr; ams->data_page_size = daddr_page_size; + addr_location__exit(&al); } struct mem_info *sample__resolve_mem(struct perf_sample *sample, @@ -2319,10 +2321,11 @@ static int add_callchain_ip(struct thread *thread, { struct map_symbol ms; struct addr_location al; - int nr_loop_iter = 0, err; + int nr_loop_iter = 0, err = 0; u64 iter_cycles = 0; const char *srcline = NULL; + addr_location__init(&al); al.filtered = 0; al.sym = NULL; al.srcline = NULL; @@ -2348,9 +2351,10 @@ static int add_callchain_ip(struct thread *thread, * Discard all. */ callchain_cursor_reset(cursor); - return 1; + err = 1; + goto out; } - return 0; + goto out; } thread__find_symbol(thread, *cpumode, ip, &al); } @@ -2363,31 +2367,32 @@ static int add_callchain_ip(struct thread *thread, symbol__match_regex(al.sym, &ignore_callees_regex)) { /* Treat this symbol as the root, forgetting its callees. */ - *root_al = al; + addr_location__copy(root_al, &al); callchain_cursor_reset(cursor); } } if (symbol_conf.hide_unresolved && al.sym == NULL) - return 0; + goto out; if (iter) { nr_loop_iter = iter->nr_loop_iter; iter_cycles = iter->cycles; } - ms.maps = al.maps; - ms.map = al.map; + ms.maps = maps__get(al.maps); + ms.map = map__get(al.map); ms.sym = al.sym; if (!branch && append_inlines(cursor, &ms, ip) == 0) - return 0; + goto out; srcline = callchain_srcline(&ms, al.addr); err = callchain_cursor_append(cursor, ip, &ms, branch, flags, nr_loop_iter, iter_cycles, branch_from, srcline); - map__put(al.map); +out: + addr_location__exit(&al); return err; } diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index f3d262e871ac..d7c99028c6e6 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -469,9 +469,11 @@ static PyObject *python_process_callchain(struct perf_sample *sample, struct addr_location node_al; unsigned long offset; + addr_location__init(&node_al); node_al.addr = map__map_ip(map, node->ip); - node_al.map = map; + node_al.map = map__get(map); offset = get_offset(node->ms.sym, &node_al); + addr_location__exit(&node_al); pydict_set_item_string_decref( pyelem, "sym_off", @@ -539,6 +541,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample, pydict_set_item_string_decref(pyelem, "cycles", PyLong_FromUnsignedLongLong(entries[i].flags.cycles)); + addr_location__init(&al); thread__find_map_fb(thread, sample->cpumode, entries[i].from, &al); dsoname = get_dsoname(al.map); @@ -551,6 +554,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample, pydict_set_item_string_decref(pyelem, "to_dsoname", _PyUnicode_FromString(dsoname)); + addr_location__exit(&al); PyList_Append(pylist, pyelem); Py_DECREF(pyelem); } @@ -594,7 +598,6 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, PyObject *pylist; u64 i; char bf[512]; - struct addr_location al; pylist = PyList_New(0); if (!pylist) @@ -605,7 +608,9 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, for (i = 0; i < br->nr; i++) { PyObject *pyelem; + struct addr_location al; + addr_location__init(&al); pyelem = PyDict_New(); if (!pyelem) Py_FatalError("couldn't create Python dictionary"); @@ -644,6 +649,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, PyList_Append(pylist, pyelem); Py_DECREF(pyelem); + addr_location__exit(&al); } exit: diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 9a1db3be6436..bee4ac1051ee 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -432,18 +432,25 @@ int thread__memcpy(struct thread *thread, struct machine *machine, if (machine__kernel_ip(machine, ip)) cpumode = PERF_RECORD_MISC_KERNEL; - if (!thread__find_map(thread, cpumode, ip, &al)) - return -1; + addr_location__init(&al); + if (!thread__find_map(thread, cpumode, ip, &al)) { + addr_location__exit(&al); + return -1; + } dso = map__dso(al.map); - if( !dso || dso->data.status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) + if (!dso || dso->data.status == DSO_DATA_STATUS_ERROR || map__load(al.map) < 0) { + addr_location__exit(&al); return -1; + } offset = map__map_ip(al.map, ip); if (is64bit) *is64bit = dso->is_64_bit; + addr_location__exit(&al); + return dso__data_read_offset(dso, machine, offset, buf, len); } diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 3723b5e31b2a..83eea968482e 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -90,8 +90,12 @@ static int __report_module(struct addr_location *al, u64 ip, static int report_module(u64 ip, struct unwind_info *ui) { struct addr_location al; + int res; - return __report_module(&al, ip, ui); + addr_location__init(&al); + res = __report_module(&al, ip, ui); + addr_location__exit(&al); + return res; } /* @@ -104,8 +108,11 @@ static int entry(u64 ip, struct unwind_info *ui) struct unwind_entry *e = &ui->entries[ui->idx++]; struct addr_location al; - if (__report_module(&al, ip, ui)) + addr_location__init(&al); + if (__report_module(&al, ip, ui)) { + addr_location__exit(&al); return -1; + } e->ip = ip; e->ms.maps = al.maps; @@ -116,6 +123,7 @@ static int entry(u64 ip, struct unwind_info *ui) al.sym ? al.sym->name : "''", ip, al.map ? map__map_ip(al.map, ip) : (u64) 0); + addr_location__exit(&al); return 0; } @@ -136,17 +144,22 @@ static int access_dso_mem(struct unwind_info *ui, Dwarf_Addr addr, ssize_t size; struct dso *dso; + addr_location__init(&al); if (!thread__find_map(ui->thread, PERF_RECORD_MISC_USER, addr, &al)) { pr_debug("unwind: no map for %lx\n", (unsigned long)addr); - return -1; + goto out_fail; } dso = map__dso(al.map); if (!dso) - return -1; + goto out_fail; size = dso__data_read_addr(dso, al.map, ui->machine, addr, (u8 *) data, sizeof(*data)); + addr_location__exit(&al); return !(size == sizeof(*data)); +out_fail: + addr_location__exit(&al); + return -1; } static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *result, diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 11f3fc95aa11..36bf5100bad2 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -416,7 +416,12 @@ static int read_unwind_spec_debug_frame(struct dso *dso, static struct map *find_map(unw_word_t ip, struct unwind_info *ui) { struct addr_location al; - return thread__find_map(ui->thread, PERF_RECORD_MISC_USER, ip, &al); + struct map *ret; + + addr_location__init(&al); + ret = thread__find_map(ui->thread, PERF_RECORD_MISC_USER, ip, &al); + addr_location__exit(&al); + return ret; } static int @@ -631,7 +636,9 @@ static int entry(u64 ip, struct thread *thread, { struct unwind_entry e; struct addr_location al; + int ret; + addr_location__init(&al); e.ms.sym = thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al); e.ip = ip; e.ms.map = al.map; @@ -642,7 +649,9 @@ static int entry(u64 ip, struct thread *thread, ip, al.map ? map__map_ip(al.map, ip) : (u64) 0); - return cb(&e, arg); + ret = cb(&e, arg); + addr_location__exit(&al); + return ret; } static void display_error(int err) -- cgit v1.2.3 From f6005cafebab72f8c02100dc896d6cfd5b8918cb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:04 -0700 Subject: perf thread: Add reference count checking Modify struct declaration and accessor functions for the reference count checkers additional layer of indirection. Make sure pid_cmp in builtin-sched.c uses the underlying/original struct in pointer arithmetic, and not the temporary get/put indirection. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 4 +-- tools/perf/tests/hists_link.c | 2 +- tools/perf/ui/hist.c | 5 ++- tools/perf/util/hist.c | 2 +- tools/perf/util/machine.c | 2 +- tools/perf/util/sort.c | 2 +- tools/perf/util/thread.c | 20 +++++++---- tools/perf/util/thread.h | 79 ++++++++++++++++++++++--------------------- 8 files changed, 63 insertions(+), 53 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c75ad82a6729..cd79068200e5 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1385,7 +1385,7 @@ static int pid_cmp(struct work_atoms *l, struct work_atoms *r) { pid_t l_tid, r_tid; - if (l->thread == r->thread) + if (RC_CHK_ACCESS(l->thread) == RC_CHK_ACCESS(r->thread)) return 0; l_tid = thread__tid(l->thread); r_tid = thread__tid(r->thread); @@ -1393,7 +1393,7 @@ static int pid_cmp(struct work_atoms *l, struct work_atoms *r) return -1; if (l_tid > r_tid) return 1; - return (int)(l->thread - r->thread); + return (int)(RC_CHK_ACCESS(l->thread) - RC_CHK_ACCESS(r->thread)); } static int avg_cmp(struct work_atoms *l, struct work_atoms *r) diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 12bad8840699..2d19657ab5e0 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -148,7 +148,7 @@ static int find_sample(struct sample *samples, size_t nr_samples, struct thread *t, struct map *m, struct symbol *s) { while (nr_samples--) { - if (samples->thread == t && + if (RC_CHK_ACCESS(samples->thread) == RC_CHK_ACCESS(t) && RC_CHK_ACCESS(samples->map) == RC_CHK_ACCESS(m) && samples->sym == s) return 1; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index f164bd26fc41..2bf959d08354 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -11,6 +11,7 @@ #include "../util/sort.h" #include "../util/evsel.h" #include "../util/evlist.h" +#include "../util/thread.h" #include "../util/util.h" /* hist period print (hpp) functions */ @@ -274,7 +275,9 @@ static int __hpp__sort_acc(struct hist_entry *a, struct hist_entry *b, if (ret) return ret; - if (a->thread != b->thread || !hist_entry__has_callchains(a) || !symbol_conf.use_callchain) + if ((a->thread == NULL ? NULL : RC_CHK_ACCESS(a->thread)) != + (b->thread == NULL ? NULL : RC_CHK_ACCESS(b->thread)) || + !hist_entry__has_callchains(a) || !symbol_conf.use_callchain) return 0; ret = b->callchain->max_depth - a->callchain->max_depth; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a4c1b617f6e4..dfda52d348a3 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2124,7 +2124,7 @@ static bool hists__filter_entry_by_thread(struct hists *hists, struct hist_entry *he) { if (hists->thread_filter != NULL && - he->thread != hists->thread_filter) { + RC_CHK_ACCESS(he->thread) != RC_CHK_ACCESS(hists->thread_filter)) { he->filtered |= (1 << HIST_FILTER__THREAD); return true; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 9fcf357a4d53..261188766307 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2055,7 +2055,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread_rb_n if (!nd) nd = thread_rb_node__find(th, &threads->entries.rb_root); - if (threads->last_match == th) + if (threads->last_match && RC_CHK_ACCESS(threads->last_match) == RC_CHK_ACCESS(th)) threads__set_last_match(threads, NULL); if (lock) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5e45c770f91d..047c3606802f 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -128,7 +128,7 @@ static int hist_entry__thread_filter(struct hist_entry *he, int type, const void if (type != HIST_FILTER__THREAD) return -1; - return th && he->thread != th; + return th && RC_CHK_ACCESS(he->thread) != RC_CHK_ACCESS(th); } struct sort_entry sort_thread = { diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index bee4ac1051ee..0b166404c5c3 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -41,9 +41,10 @@ struct thread *thread__new(pid_t pid, pid_t tid) { char *comm_str; struct comm *comm; - struct thread *thread = zalloc(sizeof(*thread)); + RC_STRUCT(thread) *_thread = zalloc(sizeof(*_thread)); + struct thread *thread; - if (thread != NULL) { + if (ADD_RC_CHK(thread, _thread) != NULL) { thread__set_pid(thread, pid); thread__set_tid(thread, tid); thread__set_ppid(thread, -1); @@ -68,7 +69,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) list_add(&comm->list, thread__comm_list(thread)); refcount_set(thread__refcnt(thread), 1); /* Thread holds first ref to nsdata. */ - thread->nsinfo = nsinfo__new(pid); + RC_CHK_ACCESS(thread)->nsinfo = nsinfo__new(pid); srccode_state_init(thread__srccode_state(thread)); } @@ -105,26 +106,31 @@ void thread__delete(struct thread *thread) } up_write(thread__comm_lock(thread)); - nsinfo__zput(thread->nsinfo); + nsinfo__zput(RC_CHK_ACCESS(thread)->nsinfo); srccode_state_free(thread__srccode_state(thread)); exit_rwsem(thread__namespaces_lock(thread)); exit_rwsem(thread__comm_lock(thread)); thread__free_stitch_list(thread); - free(thread); + RC_CHK_FREE(thread); } struct thread *thread__get(struct thread *thread) { - if (thread) + struct thread *result; + + if (RC_CHK_GET(result, thread)) refcount_inc(thread__refcnt(thread)); - return thread; + + return result; } void thread__put(struct thread *thread) { if (thread && refcount_dec_and_test(thread__refcnt(thread))) thread__delete(thread); + else + RC_CHK_PUT(thread); } static struct namespaces *__thread__namespaces(struct thread *thread) diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index b103992c3831..9068a21ce0fa 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -15,6 +15,7 @@ #include "rwsem.h" #include "event.h" #include "callchain.h" +#include struct addr_location; struct map; @@ -34,7 +35,7 @@ struct thread_rb_node { struct thread *thread; }; -struct thread { +DECLARE_RC_STRUCT(thread) { struct maps *maps; pid_t pid_; /* Not all tools update this */ pid_t tid; @@ -123,192 +124,192 @@ int thread__memcpy(struct thread *thread, struct machine *machine, static inline struct maps *thread__maps(struct thread *thread) { - return thread->maps; + return RC_CHK_ACCESS(thread)->maps; } static inline void thread__set_maps(struct thread *thread, struct maps *maps) { - thread->maps = maps; + RC_CHK_ACCESS(thread)->maps = maps; } static inline pid_t thread__pid(const struct thread *thread) { - return thread->pid_; + return RC_CHK_ACCESS(thread)->pid_; } static inline void thread__set_pid(struct thread *thread, pid_t pid_) { - thread->pid_ = pid_; + RC_CHK_ACCESS(thread)->pid_ = pid_; } static inline pid_t thread__tid(const struct thread *thread) { - return thread->tid; + return RC_CHK_ACCESS(thread)->tid; } static inline void thread__set_tid(struct thread *thread, pid_t tid) { - thread->tid = tid; + RC_CHK_ACCESS(thread)->tid = tid; } static inline pid_t thread__ppid(const struct thread *thread) { - return thread->ppid; + return RC_CHK_ACCESS(thread)->ppid; } static inline void thread__set_ppid(struct thread *thread, pid_t ppid) { - thread->ppid = ppid; + RC_CHK_ACCESS(thread)->ppid = ppid; } static inline int thread__cpu(const struct thread *thread) { - return thread->cpu; + return RC_CHK_ACCESS(thread)->cpu; } static inline void thread__set_cpu(struct thread *thread, int cpu) { - thread->cpu = cpu; + RC_CHK_ACCESS(thread)->cpu = cpu; } static inline int thread__guest_cpu(const struct thread *thread) { - return thread->guest_cpu; + return RC_CHK_ACCESS(thread)->guest_cpu; } static inline void thread__set_guest_cpu(struct thread *thread, int guest_cpu) { - thread->guest_cpu = guest_cpu; + RC_CHK_ACCESS(thread)->guest_cpu = guest_cpu; } static inline refcount_t *thread__refcnt(struct thread *thread) { - return &thread->refcnt; + return &RC_CHK_ACCESS(thread)->refcnt; } static inline bool thread__comm_set(const struct thread *thread) { - return thread->comm_set; + return RC_CHK_ACCESS(thread)->comm_set; } static inline void thread__set_comm_set(struct thread *thread, bool set) { - thread->comm_set = set; + RC_CHK_ACCESS(thread)->comm_set = set; } static inline int thread__var_comm_len(const struct thread *thread) { - return thread->comm_len; + return RC_CHK_ACCESS(thread)->comm_len; } static inline void thread__set_comm_len(struct thread *thread, int len) { - thread->comm_len = len; + RC_CHK_ACCESS(thread)->comm_len = len; } static inline struct list_head *thread__namespaces_list(struct thread *thread) { - return &thread->namespaces_list; + return &RC_CHK_ACCESS(thread)->namespaces_list; } static inline int thread__namespaces_list_empty(const struct thread *thread) { - return list_empty(&thread->namespaces_list); + return list_empty(&RC_CHK_ACCESS(thread)->namespaces_list); } static inline struct rw_semaphore *thread__namespaces_lock(struct thread *thread) { - return &thread->namespaces_lock; + return &RC_CHK_ACCESS(thread)->namespaces_lock; } static inline struct list_head *thread__comm_list(struct thread *thread) { - return &thread->comm_list; + return &RC_CHK_ACCESS(thread)->comm_list; } static inline struct rw_semaphore *thread__comm_lock(struct thread *thread) { - return &thread->comm_lock; + return &RC_CHK_ACCESS(thread)->comm_lock; } static inline u64 thread__db_id(const struct thread *thread) { - return thread->db_id; + return RC_CHK_ACCESS(thread)->db_id; } static inline void thread__set_db_id(struct thread *thread, u64 db_id) { - thread->db_id = db_id; + RC_CHK_ACCESS(thread)->db_id = db_id; } static inline void *thread__priv(struct thread *thread) { - return thread->priv; + return RC_CHK_ACCESS(thread)->priv; } static inline void thread__set_priv(struct thread *thread, void *p) { - thread->priv = p; + RC_CHK_ACCESS(thread)->priv = p; } static inline struct thread_stack *thread__ts(struct thread *thread) { - return thread->ts; + return RC_CHK_ACCESS(thread)->ts; } static inline void thread__set_ts(struct thread *thread, struct thread_stack *ts) { - thread->ts = ts; + RC_CHK_ACCESS(thread)->ts = ts; } static inline struct nsinfo *thread__nsinfo(struct thread *thread) { - return thread->nsinfo; + return RC_CHK_ACCESS(thread)->nsinfo; } static inline struct srccode_state *thread__srccode_state(struct thread *thread) { - return &thread->srccode_state; + return &RC_CHK_ACCESS(thread)->srccode_state; } static inline bool thread__filter(const struct thread *thread) { - return thread->filter; + return RC_CHK_ACCESS(thread)->filter; } static inline void thread__set_filter(struct thread *thread, bool filter) { - thread->filter = filter; + RC_CHK_ACCESS(thread)->filter = filter; } static inline int thread__filter_entry_depth(const struct thread *thread) { - return thread->filter_entry_depth; + return RC_CHK_ACCESS(thread)->filter_entry_depth; } static inline void thread__set_filter_entry_depth(struct thread *thread, int depth) { - thread->filter_entry_depth = depth; + RC_CHK_ACCESS(thread)->filter_entry_depth = depth; } static inline bool thread__lbr_stitch_enable(const struct thread *thread) { - return thread->lbr_stitch_enable; + return RC_CHK_ACCESS(thread)->lbr_stitch_enable; } static inline void thread__set_lbr_stitch_enable(struct thread *thread, bool en) { - thread->lbr_stitch_enable = en; + RC_CHK_ACCESS(thread)->lbr_stitch_enable = en; } static inline struct lbr_stitch *thread__lbr_stitch(struct thread *thread) { - return thread->lbr_stitch; + return RC_CHK_ACCESS(thread)->lbr_stitch; } static inline void thread__set_lbr_stitch(struct thread *thread, struct lbr_stitch *lbrs) { - thread->lbr_stitch = lbrs; + RC_CHK_ACCESS(thread)->lbr_stitch = lbrs; } static inline bool thread__is_filtered(struct thread *thread) -- cgit v1.2.3 From cf078c838181366867091b024ff351ea2a414e0c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:05 -0700 Subject: perf machine: Make delete_threads part of machine__exit The code required threads to be deleted before machine__exit was called or the threads would be leaked. This was error prone so move the delete_threads into machine__exit. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 1 - tools/perf/tests/dwarf-unwind.c | 1 - tools/perf/tests/mmap-thread-lookup.c | 1 - tools/perf/tests/symbols.c | 1 - tools/perf/util/machine.c | 1 + tools/perf/util/session.c | 6 ------ 6 files changed, 1 insertion(+), 10 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 2a7b2b6f5286..ed3815163d1b 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -721,7 +721,6 @@ out_err: evlist__delete(evlist); perf_cpu_map__put(cpus); perf_thread_map__put(threads); - machine__delete_threads(machine); machine__delete(machine); return err; diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index ee983b677a6a..d01aa931fe81 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -235,7 +235,6 @@ noinline int test__dwarf_unwind(struct test_suite *test __maybe_unused, thread__put(thread); out: - machine__delete_threads(machine); machine__delete(machine); return err; } diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 3891a2a3b46f..ddd1da9a4ba9 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -208,7 +208,6 @@ static int mmap_events(synth_cb synth) addr_location__exit(&al); } - machine__delete_threads(machine); machine__delete(machine); return err; } diff --git a/tools/perf/tests/symbols.c b/tools/perf/tests/symbols.c index 2d1aa42d36a9..16e1c5502b09 100644 --- a/tools/perf/tests/symbols.c +++ b/tools/perf/tests/symbols.c @@ -38,7 +38,6 @@ static int init_test_info(struct test_info *ti) static void exit_test_info(struct test_info *ti) { thread__put(ti->thread); - machine__delete_threads(ti->machine); machine__delete(ti->machine); } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 261188766307..46af5e9748c9 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -256,6 +256,7 @@ void machine__exit(struct machine *machine) zfree(&machine->current_tid); zfree(&machine->kallsyms_filename); + machine__delete_threads(machine); for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 65ac9f7fdf7e..00d18c74c090 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -278,11 +278,6 @@ struct perf_session *__perf_session__new(struct perf_data *data, return ERR_PTR(ret); } -static void perf_session__delete_threads(struct perf_session *session) -{ - machine__delete_threads(&session->machines.host); -} - static void perf_decomp__release_events(struct decomp *next) { struct decomp *decomp; @@ -305,7 +300,6 @@ void perf_session__delete(struct perf_session *session) auxtrace__free(session); auxtrace_index__free(&session->auxtrace_index); perf_session__destroy_kernel_maps(session); - perf_session__delete_threads(session); perf_decomp__release_events(session->decomp_data.decomp); perf_env__exit(&session->header.env); machines__exit(&session->machines); -- cgit v1.2.3 From 2c9f7bd7951af269afe9680746f4c566c3abc769 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:06 -0700 Subject: perf report: Avoid 'parent_thread' thread leak on '--tasks' processing Caught with address sanitizer and reference count checking. Committer notes: The command leading to this leak: # perf record -a sleep 2 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 2.516 MB perf.data (6422 samples) ] # perf report --tasks # pid tid ppid comm 0 0 -1 |swapper 1 1 0 | systemd 1474 1474 1 | systemd 2816 2816 1474 | gjs 2816 2825 2816 | gmain 2816 2831 2816 | gdbus 2816 2861 2816 | JS Helper 2816 2862 2816 | JS Helper 2816 2863 2816 | JS Helper 2816 2864 2816 | JS Helper 2816 2865 2816 | JS Helper 2816 2866 2816 | JS Helper 2816 2867 2816 | JS Helper 2816 2868 2816 | JS Helper 3072 3072 1474 | gsd-printer 3072 3082 3072 | gmain 3072 3083 3072 | gdbus 2600 2600 1474 | gnome-shell 15621 15621 2600 | firefox 15771 15771 15621 | WebExtensions 15771 15872 15771 | TaskCon~ller #6 15771 15873 15771 | TaskCon~ller #7 15771 15778 15771 | IPC I/O Child 15771 15779 15771 | Socket Thread 15771 15780 15771 | HTML5 Parser 15771 15781 15771 | JS Watchdog # When it is going to exit a thread__put(parent_thread) was missed, add it to have ASAN clean. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 0b091a8983a5..a31a23af5547 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -839,6 +839,7 @@ static struct task *tasks_list(struct task *task, struct machine *machine) return ERR_PTR(-ENOENT); parent_task = thread__priv(parent_thread); + thread__put(parent_thread); list_add_tail(&task->list, &parent_task->children); return tasks_list(parent_task, machine); } -- cgit v1.2.3 From f8e502b9d1b3b1979590b3fc0f9b2a65fedfcb9b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:07 -0700 Subject: perf header: Ensure bitmaps are freed memory_node bitmaps need a bitmap_free to avoid memory leaks. Caught by leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d85b39079c31..3db7c1fae71e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1389,6 +1389,14 @@ static int memory_node__read(struct memory_node *n, unsigned long idx) return 0; } +static void memory_node__delete_nodes(struct memory_node *nodesp, u64 cnt) +{ + for (u64 i = 0; i < cnt; i++) + bitmap_free(nodesp[i].set); + + free(nodesp); +} + static int memory_node__sort(const void *a, const void *b) { const struct memory_node *na = a; @@ -1449,7 +1457,7 @@ out: *nodesp = nodes; qsort(nodes, cnt, sizeof(nodes[0]), memory_node__sort); } else - free(nodes); + memory_node__delete_nodes(nodes, cnt); return ret; } @@ -1516,7 +1524,7 @@ static int write_mem_topology(struct feat_fd *ff __maybe_unused, } out: - free(nodes); + memory_node__delete_nodes(nodes, nr); return ret; } -- cgit v1.2.3 From 2b87be183bca9774c8ce238f5fc84d3b3f671b33 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:08 -0700 Subject: perf stat: Avoid evlist leak Free evlist before overwriting in "perf stat report" mode. Detected using leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c87c6897edc9..fc615bdeed4f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2427,6 +2427,7 @@ static int __cmd_report(int argc, const char **argv) perf_stat.session = session; stat_config.output = stderr; + evlist__delete(evsel_list); evsel_list = session->evlist; ret = perf_session__process_events(session); -- cgit v1.2.3 From 084770f55acb41505258f5035387312fd6f0592f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:09 -0700 Subject: perf intel-pt: Fix missed put and leak Add missing put and free, detected with leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 783ce61c6d25..dbf0bc71a63b 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1280,6 +1280,7 @@ static void intel_pt_add_br_stack(struct intel_pt *pt, pt->kernel_start); sample->branch_stack = pt->br_stack; + thread__put(thread); } /* INTEL_PT_LBR_0, INTEL_PT_LBR_1 and INTEL_PT_LBR_2 */ @@ -3580,6 +3581,7 @@ static void intel_pt_free(struct perf_session *session) zfree(&pt->chain); zfree(&pt->filter); zfree(&pt->time_ranges); + zfree(&pt->br_stack); free(pt); } -- cgit v1.2.3 From ac873ac32618dd1ce9a46ade575a421f0e1bf779 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:10 -0700 Subject: perf evlist: Free stats in all evlist destruction There is no evsel free stats, freeing in the evlist__delete ensures memory leaks are avoided. Issues detected with "perf stat report" and leak sanitizer, perf stat uses perf_session__delete to free the evlist. Add dummy symbol for python build. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 ++ tools/perf/util/python.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 82c0b3d0c822..7ef43f72098e 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -31,6 +31,7 @@ #include "util/pmu.h" #include "util/sample.h" #include "util/bpf-filter.h" +#include "util/stat.h" #include "util/util.h" #include #include @@ -171,6 +172,7 @@ void evlist__delete(struct evlist *evlist) if (evlist == NULL) return; + evlist__free_stats(evlist); evlist__munmap(evlist); evlist__close(evlist); evlist__purge(evlist); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 8de1b759bbaa..a7b2cb05dc86 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1494,3 +1494,7 @@ void test_attr__open(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cp int fd, int group_fd, unsigned long flags) { } + +void evlist__free_stats(struct evlist *evlist) +{ +} -- cgit v1.2.3 From 51cfe7a3e87ed760f6604ad4bbce898b9a3f8f92 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:11 -0700 Subject: perf python: Avoid 2 leak sanitizer issues Leak sanitizer complains about the variable size bf allocation and store to bf if sized 0. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index d7c99028c6e6..d96e5c0fef45 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -735,6 +735,9 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch unsigned int i = 0, r; int printed = 0; + if (size <= 0) + return; + bf[0] = 0; if (!regs || !regs->regs) @@ -764,7 +767,7 @@ static void set_regs_in_dict(PyObject *dict, * 10 chars is for register name. */ int size = __sw_hweight64(attr->sample_regs_intr) * 28; - char bf[size]; + char *bf = malloc(size); regs_map(&sample->intr_regs, attr->sample_regs_intr, arch, bf, sizeof(bf)); @@ -775,6 +778,7 @@ static void set_regs_in_dict(PyObject *dict, pydict_set_item_string_decref(dict, "uregs", _PyUnicode_FromString(bf)); + free(bf); } static void set_sym_in_dict(PyObject *dict, struct addr_location *al, -- cgit v1.2.3 From 5cedd1e29d4513c54be2c681cffaad47058f8cc0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:12 -0700 Subject: perf jit: Fix two thread leaks As reported by leak sanitizer with reference count checking. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jitdump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 2380b41a4caa..6b2b96c16ccd 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -800,6 +800,7 @@ static void jit_add_pid(struct machine *machine, pid_t pid) } thread__set_priv(thread, (void *)true); + thread__put(thread); } static bool jit_has_pid(struct machine *machine, pid_t pid) @@ -811,6 +812,7 @@ static bool jit_has_pid(struct machine *machine, pid_t pid) return false; priv = thread__priv(thread); + thread__put(thread); return (bool)priv; } -- cgit v1.2.3 From fe8fec1028dc382606a91c4bf27d3dd350c306bd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:13 -0700 Subject: perf symbol-elf: Correct holding a reference If a reference is held, don't put it as this will confuse reference count checking. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 63882a4db5c7..e6493d1cc251 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1389,11 +1389,11 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, /* Ensure maps are correctly ordered */ if (kmaps) { int err; + struct map *tmp = map__get(map); - map__get(map); maps__remove(kmaps, map); err = maps__insert(kmaps, map); - map__put(map); + map__put(tmp); if (err) return err; } -- cgit v1.2.3 From 814a656870eee89062b960c48c1fdd6064cd0bbf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:14 -0700 Subject: perf maps: Fix overlapping memory leak Add a missed free detected by leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 5206a6433117..233438c95b53 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -374,6 +374,7 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) } put_map: map__put(pos->map); + free(pos); } up_write(maps__lock(maps)); return err; -- cgit v1.2.3 From 34b29bd61d4e0385164d569f2dd8ffc3b4058ed6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:15 -0700 Subject: perf machine: Fix leak of kernel dso The kernel dso may be found by searching dsos or allocating if not found. The allocation returns with a reference count of 2, once for the dsos list and once for the returned value. The list search has a reference count of 1, once for the dsos list. To make the reference counts consistent, increase the dsos list search reference count to 2 with a dso__get, and do a put when the scope ends for either the allocated or found dso. This issue was found with leak sanitizer and reference count checking. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 46af5e9748c9..f8e6c07f0048 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1868,7 +1868,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, continue; - kernel = dso; + kernel = dso__get(dso); break; } @@ -1913,6 +1913,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, */ dso__load(kernel, machine__kernel_map(machine)); } + dso__put(kernel); } else if (perf_event__is_extra_kernel_mmap(machine, xm)) { return machine__process_extra_kernel_map(machine, xm); } -- cgit v1.2.3 From 1981da1fe2499823f626c86c5ba3be6b89844384 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:16 -0700 Subject: perf machine: Don't leak module maps machine__addnew_module_map requires a put on its result. Add this and narrow the scope of map to make the correctness more obvious. This leak was caught with leak sanitizer and the reference count checker. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f8e6c07f0048..359ef6b4e840 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1797,7 +1797,6 @@ static int machine__process_kernel_mmap_event(struct machine *machine, struct extra_kernel_map *xm, struct build_id *bid) { - struct map *map; enum dso_space_type dso_space; bool is_kernel_mmap; const char *mmap_name = machine->mmap_name; @@ -1823,8 +1822,8 @@ static int machine__process_kernel_mmap_event(struct machine *machine, } if (xm->name[0] == '/' || (!is_kernel_mmap && xm->name[0] == '[')) { - map = machine__addnew_module_map(machine, xm->start, - xm->name); + struct map *map = machine__addnew_module_map(machine, xm->start, xm->name); + if (map == NULL) goto out_problem; @@ -1833,6 +1832,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine, if (build_id__is_defined(bid)) dso__set_build_id(map__dso(map), bid); + map__put(map); } else if (is_kernel_mmap) { const char *symbol_name = xm->name + strlen(mmap_name); /* -- cgit v1.2.3 From bffb5b0c0976aa46aaa961dd19a47c9d6301cfe1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:17 -0700 Subject: perf map/maps/thread: Changes to reference counting Fix missed reference count gets and puts as detected with leak sanitizer and reference count checking. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 28 ++++++++++++++++++++++------ tools/perf/util/event.c | 3 +++ tools/perf/util/hist.c | 6 ++++-- tools/perf/util/machine.c | 29 +++++++++++++++++------------ 4 files changed, 46 insertions(+), 20 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index b0dafc758173..909f62b3b266 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -590,6 +590,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->ip = cursor_node->ip; call->ms = cursor_node->ms; call->ms.map = map__get(call->ms.map); + call->ms.maps = maps__get(call->ms.maps); call->srcline = cursor_node->srcline; if (cursor_node->branch) { @@ -649,6 +650,7 @@ add_child(struct callchain_node *parent, list_for_each_entry_safe(call, tmp, &new->val, list) { list_del_init(&call->list); map__zput(call->ms.map); + maps__zput(call->ms.maps); free(call); } free(new); @@ -1010,10 +1012,16 @@ merge_chain_branch(struct callchain_cursor *cursor, int err = 0; list_for_each_entry_safe(list, next_list, &src->val, list) { - callchain_cursor_append(cursor, list->ip, &list->ms, - false, NULL, 0, 0, 0, list->srcline); + struct map_symbol ms = { + .maps = maps__get(list->ms.maps), + .map = map__get(list->ms.map), + }; + callchain_cursor_append(cursor, list->ip, &ms, false, NULL, 0, 0, 0, list->srcline); list_del_init(&list->list); + map__zput(ms.map); + maps__zput(ms.maps); map__zput(list->ms.map); + maps__zput(list->ms.maps); free(list); } @@ -1065,9 +1073,11 @@ int callchain_cursor_append(struct callchain_cursor *cursor, } node->ip = ip; + maps__zput(node->ms.maps); map__zput(node->ms.map); node->ms = *ms; - node->ms.map = map__get(node->ms.map); + node->ms.maps = maps__get(ms->maps); + node->ms.map = map__get(ms->map); node->branch = branch; node->nr_loop_iter = nr_loop_iter; node->iter_cycles = iter_cycles; @@ -1114,7 +1124,8 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * { struct machine *machine = maps__machine(node->ms.maps); - al->maps = node->ms.maps; + maps__put(al->maps); + al->maps = maps__get(node->ms.maps); map__put(al->map); al->map = map__get(node->ms.map); al->sym = node->ms.sym; @@ -1127,7 +1138,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * if (al->map == NULL) goto out; } - if (al->maps == machine__kernel_maps(machine)) { + if (RC_CHK_ACCESS(al->maps) == RC_CHK_ACCESS(machine__kernel_maps(machine))) { if (machine__is_host(machine)) { al->cpumode = PERF_RECORD_MISC_KERNEL; al->level = 'k'; @@ -1460,12 +1471,14 @@ static void free_callchain_node(struct callchain_node *node) list_for_each_entry_safe(list, tmp, &node->parent_val, list) { list_del_init(&list->list); map__zput(list->ms.map); + maps__zput(list->ms.maps); free(list); } list_for_each_entry_safe(list, tmp, &node->val, list) { list_del_init(&list->list); map__zput(list->ms.map); + maps__zput(list->ms.maps); free(list); } @@ -1551,6 +1564,7 @@ out: list_for_each_entry_safe(chain, new, &head, list) { list_del_init(&chain->list); map__zput(chain->ms.map); + maps__zput(chain->ms.maps); free(chain); } return -ENOMEM; @@ -1596,8 +1610,10 @@ void callchain_cursor_reset(struct callchain_cursor *cursor) cursor->nr = 0; cursor->last = &cursor->first; - for (node = cursor->first; node != NULL; node = node->next) + for (node = cursor->first; node != NULL; node = node->next) { map__zput(node->ms.map); + maps__zput(node->ms.maps); + } } void callchain_param_setup(u64 sample_type, const char *arch) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 2fcfba38fc48..3860b0c74829 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -711,6 +711,9 @@ int machine__resolve(struct machine *machine, struct addr_location *al, if (thread__is_filtered(thread)) al->filtered |= (1 << HIST_FILTER__THREAD); + thread__put(thread); + thread = NULL; + al->sym = NULL; al->cpu = sample->cpu; al->socket = -1; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dfda52d348a3..fb218b3e8a7c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -450,6 +450,7 @@ static int hist_entry__init(struct hist_entry *he, memset(&he->stat, 0, sizeof(he->stat)); } + he->ms.maps = maps__get(he->ms.maps); he->ms.map = map__get(he->ms.map); if (he->branch_info) { @@ -497,7 +498,7 @@ static int hist_entry__init(struct hist_entry *he, } INIT_LIST_HEAD(&he->pairs.node); - thread__get(he->thread); + he->thread = thread__get(he->thread); he->hroot_in = RB_ROOT_CACHED; he->hroot_out = RB_ROOT_CACHED; @@ -523,6 +524,7 @@ err_infos: map__put(he->mem_info->daddr.ms.map); } err: + maps__zput(he->ms.maps); map__zput(he->ms.map); zfree(&he->stat_acc); return -ENOMEM; @@ -611,7 +613,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, * keys were used. */ cmp = hist_entry__cmp(he, entry); - if (!cmp) { if (sample_self) { he_stat__add_period(&he->stat, period); @@ -1309,6 +1310,7 @@ void hist_entry__delete(struct hist_entry *he) struct hist_entry_ops *ops = he->ops; thread__zput(he->thread); + maps__zput(he->ms.maps); map__zput(he->ms.map); if (he->branch_info) { diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 359ef6b4e840..bdad4b8bf77d 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -539,7 +539,7 @@ static void machine__update_thread_pid(struct machine *machine, goto out_err; if (thread__maps(th) == thread__maps(leader)) - return; + goto out_put; if (thread__maps(th)) { /* @@ -579,7 +579,7 @@ __threads__get_last_match(struct threads *threads, struct machine *machine, machine__update_thread_pid(machine, th, pid); return thread__get(th); } - + thread__put(threads->last_match); threads->last_match = NULL; } @@ -601,7 +601,8 @@ threads__get_last_match(struct threads *threads, struct machine *machine, static void __threads__set_last_match(struct threads *threads, struct thread *th) { - threads->last_match = th; + thread__put(threads->last_match); + threads->last_match = thread__get(th); } static void @@ -664,7 +665,6 @@ static struct thread *____machine__findnew_thread(struct machine *machine, rb_link_node(&nd->rb_node, parent, p); rb_insert_color_cached(&nd->rb_node, &threads->entries, leftmost); - /* * We have to initialize maps separately after rb tree is updated. * @@ -673,6 +673,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, * the rb tree. */ if (thread__init_maps(th, machine)) { + pr_err("Thread init failed thread %d\n", pid); rb_erase_cached(&nd->rb_node, &threads->entries); RB_CLEAR_NODE(&nd->rb_node); free(nd); @@ -682,11 +683,10 @@ static struct thread *____machine__findnew_thread(struct machine *machine, /* * It is now in the rbtree, get a ref */ - thread__get(th); threads__set_last_match(threads, th); ++threads->nr; - return th; + return thread__get(th); } struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid) @@ -2321,7 +2321,7 @@ static int add_callchain_ip(struct thread *thread, struct iterations *iter, u64 branch_from) { - struct map_symbol ms; + struct map_symbol ms = {}; struct addr_location al; int nr_loop_iter = 0, err = 0; u64 iter_cycles = 0; @@ -2395,6 +2395,8 @@ static int add_callchain_ip(struct thread *thread, iter_cycles, branch_from, srcline); out: addr_location__exit(&al); + maps__put(ms.maps); + map__put(ms.map); return err; } @@ -3089,6 +3091,7 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms struct dso *dso; u64 addr; int ret = 1; + struct map_symbol ilist_ms; if (!symbol_conf.inline_name || !map || !sym) return ret; @@ -3105,18 +3108,20 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms inlines__tree_insert(&dso->inlined_nodes, inline_node); } + ilist_ms = (struct map_symbol) { + .maps = maps__get(ms->maps), + .map = map__get(map), + }; list_for_each_entry(ilist, &inline_node->val, list) { - struct map_symbol ilist_ms = { - .maps = ms->maps, - .map = map, - .sym = ilist->symbol, - }; + ilist_ms.sym = ilist->symbol; ret = callchain_cursor_append(cursor, ip, &ilist_ms, false, NULL, 0, 0, 0, ilist->srcline); if (ret != 0) return ret; } + map__put(ilist_ms.map); + maps__put(ilist_ms.maps); return ret; } -- cgit v1.2.3 From d3d53b2e9617ea606aae91a013163895f037de96 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:18 -0700 Subject: perf annotate: Fix parse_objdump_line memory leak fileloc is used to hold a previous line, before overwriting it ensure the previous contents is freed. Free the storage once done in symbol__disassemble. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 7f05f2a2aa83..57ef616cdbfd 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1546,6 +1546,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, /* /filename:linenr ? Save line number and ignore. */ if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { *line_nr = atoi(parsed_line + match[1].rm_so); + free(*fileloc); *fileloc = strdup(parsed_line); return 0; } @@ -1594,7 +1595,6 @@ static int symbol__parse_objdump_line(struct symbol *sym, } annotation_line__add(&dl->al, ¬es->src->source); - return 0; } @@ -2136,6 +2136,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) nline++; } free(line); + free(fileloc); err = finish_command(&objdump_process); if (err) -- cgit v1.2.3 From cddeeeda8fba4156255abf5a1d8c2517de8db0cd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:19 -0700 Subject: perf top: Add exit routine for main thread Add exit_process_thread that reverses init_process_thread. This avoids leak sanitizer reporting memory leaks. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 99010dfa5760..c363c04e16df 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -392,7 +392,7 @@ static void prompt_percent(int *target, const char *msg) static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) { - char *buf = malloc(0), *p; + char *buf = NULL, *p; struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL; struct hists *hists = evsel__hists(top->sym_evsel); struct rb_node *next; @@ -1227,6 +1227,14 @@ static void init_process_thread(struct perf_top *top) cond_init(&top->qe.cond); } +static void exit_process_thread(struct perf_top *top) +{ + ordered_events__free(&top->qe.data[0]); + ordered_events__free(&top->qe.data[1]); + mutex_destroy(&top->qe.mutex); + cond_destroy(&top->qe.cond); +} + static int __cmd_top(struct perf_top *top) { struct record_opts *opts = &top->record_opts; @@ -1357,6 +1365,7 @@ out_join_thread: cond_signal(&top->qe.cond); pthread_join(thread_process, NULL); perf_set_singlethreaded(); + exit_process_thread(top); return ret; } -- cgit v1.2.3 From d7ba60a4e590f79e6f28c0fb47d4a862656b1d70 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:20 -0700 Subject: perf header: Avoid out-of-bounds read intel-pt tests were failing: -- Test virtual LBR --- Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.126 MB /tmp/perf-test-intel-pt-sh.FW57CXnCqQ/test-perf.data ] Failed with virtual lbr ... ``` The root cause is an out-of-bounds read in header (where maxbrstack.py is from test_intel_pt.sh): ``` $ perf --no-pager script --itrace=L -s maxbrstack.py ================================================================= ==3907930==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6020000095a8 at pc 0x563c26c840bb bp 0x7fff43582710 sp 0x7fff43582708 READ of size 4 at 0x6020000095a8 thread T0 #0 0x563c26c840ba in process_group_desc util/header.c:2847 #1 0x563c26c8bc78 in perf_file_section__process util/header.c:4037 #2 0x563c26c8aa9b in perf_header__process_sections util/header.c:3813 #3 0x563c26c8d028 in perf_session__read_header util/header.c:4286 #4 0x563c26cbab29 in perf_session__open util/session.c:113 #5 0x563c26cbb3d0 in __perf_session__new util/session.c:221 #6 0x563c26aacb14 in perf_session__new util/session.h:73 #7 0x563c26acf7f1 in cmd_script tools/perf/builtin-script.c:4212 #8 0x563c26bb58ff in run_builtin tools/perf/perf.c:323 #9 0x563c26bb5e70 in handle_internal_command tools/perf/perf.c:377 #10 0x563c26bb6238 in run_argv tools/perf/perf.c:421 #11 0x563c26bb67a0 in main tools/perf/perf.c:537 #12 0x7f34bde46189 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 #13 0x7f34bde46244 in __libc_start_main_impl ../csu/libc-start.c:381 #14 0x563c26a33390 in _start (/tmp/perf/perf+0x1eb390) 0x6020000095a8 is located 8 bytes to the right of 16-byte region [0x602000009590,0x6020000095a0) allocated by thread T0 here: #0 0x7f34beeb83b7 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:77 #1 0x563c26c83df8 in process_group_desc util/header.c:2824 #2 0x563c26c8bc78 in perf_file_section__process util/header.c:4037 #3 0x563c26c8aa9b in perf_header__process_sections util/header.c:3813 #4 0x563c26c8d028 in perf_session__read_header util/header.c:4286 #5 0x563c26cbab29 in perf_session__open util/session.c:113 #6 0x563c26cbb3d0 in __perf_session__new util/session.c:221 #7 0x563c26aacb14 in perf_session__new util/session.h:73 #8 0x563c26acf7f1 in cmd_script tools/perf/builtin-script.c:4212 #9 0x563c26bb58ff in run_builtin tools/perf/perf.c:323 #10 0x563c26bb5e70 in handle_internal_command tools/perf/perf.c:377 #11 0x563c26bb6238 in run_argv tools/perf/perf.c:421 #12 0x563c26bb67a0 in main tools/perf/perf.c:537 #13 0x7f34bde46189 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 ``` Avoid the out-of-bounds read checking for the leader. Leave the 'nr' check intact as nr will be 0 or the counting down and evsel be a group member. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ian Rogers Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/lkml/20230608232823.4027869-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3db7c1fae71e..52fbf526fe74 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2844,7 +2844,7 @@ static int process_group_desc(struct feat_fd *ff, void *data __maybe_unused) i = nr = 0; evlist__for_each_entry(session->evlist, evsel) { - if (evsel->core.idx == (int) desc[i].leader_idx) { + if (i < nr_groups && evsel->core.idx == (int) desc[i].leader_idx) { evsel__set_leader(evsel, evsel); /* {anon_group} is a dummy name */ if (strcmp(desc[i].name, "{anon_group}")) { -- cgit v1.2.3 From 8ab12a2038e36beda4062a8e7562a8cfe9655553 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:21 -0700 Subject: perf callchain: Use pthread keys for tls callchain_cursor Pthread keys are more portable than __thread and allow the association of a destructor with the key. Use the destructor to clean up TLS callchain cursors to aid understanding memory leaks. Committer notes: Had to fixup a series of unconverted places and also check for the return of get_tls_callchain_cursor() as it may fail and return NULL. In that unlikely case we now either print something to a file, if the caller was expecting to print a callchain, or return an error code to state that resolving the callchain isn't possible. In some cases this was made easier because thread__resolve_callchain() already can fail for other reasons, so this new one (cursor == NULL) can be added and the callers don't have to explicitely check for this new condition. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-25-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 4 +- tools/perf/builtin-kmem.c | 14 +++++-- tools/perf/builtin-kwork.c | 12 +++++- tools/perf/builtin-lock.c | 7 +++- tools/perf/builtin-sched.c | 14 +++++-- tools/perf/builtin-script.c | 24 ++++++----- tools/perf/builtin-trace.c | 20 ++++++---- tools/perf/util/callchain.c | 45 ++++++++++++++++++++- tools/perf/util/callchain.h | 8 ++-- tools/perf/util/db-export.c | 10 +++-- tools/perf/util/evsel_fprintf.c | 3 ++ tools/perf/util/hist.c | 46 +++++++++++++++------- tools/perf/util/machine.c | 3 ++ .../perf/util/scripting-engines/trace-event-perl.c | 11 ++++-- .../util/scripting-engines/trace-event-python.c | 10 +++-- 15 files changed, 170 insertions(+), 61 deletions(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 530a44a59f41..a4cf9de7a7b5 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -284,6 +284,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, struct hist_entry *he; struct addr_location al; struct mem_info *mi, *mi_dup; + struct callchain_cursor *cursor; int ret; addr_location__init(&al); @@ -297,7 +298,8 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, if (c2c.stitch_lbr) thread__set_lbr_stitch_enable(al.thread, true); - ret = sample__resolve_callchain(sample, &callchain_cursor, NULL, + cursor = get_tls_callchain_cursor(); + ret = sample__resolve_callchain(sample, cursor, NULL, evsel, &al, sysctl_perf_event_max_stack); if (ret) goto out; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 96a6611e4e53..9714327fd0ea 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -399,6 +399,7 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample) struct addr_location al; struct machine *machine = &kmem_session->machines.host; struct callchain_cursor_node *node; + struct callchain_cursor *cursor; u64 result = sample->ip; addr_location__init(&al); @@ -408,14 +409,19 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample) } al.thread = machine__findnew_thread(machine, sample->pid, sample->tid); - sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16); - callchain_cursor_commit(&callchain_cursor); + cursor = get_tls_callchain_cursor(); + if (cursor == NULL) + goto out; + + sample__resolve_callchain(sample, cursor, NULL, evsel, &al, 16); + + callchain_cursor_commit(cursor); while (true) { struct alloc_func key, *caller; u64 addr; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(cursor); if (node == NULL) break; @@ -434,7 +440,7 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample) } else pr_debug3("skipping alloc function: %s\n", caller->name); - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(cursor); } pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip); diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 2d80aef4eccc..14bf7a8429e7 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -589,7 +589,7 @@ static void timehist_save_callchain(struct perf_kwork *kwork, struct symbol *sym; struct thread *thread; struct callchain_cursor_node *node; - struct callchain_cursor *cursor = &callchain_cursor; + struct callchain_cursor *cursor; if (!kwork->show_callchain || sample->callchain == NULL) return; @@ -601,6 +601,8 @@ static void timehist_save_callchain(struct perf_kwork *kwork, return; } + cursor = get_tls_callchain_cursor(); + if (thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, kwork->max_stack + 2) != 0) { pr_debug("Failed to resolve callchain, skipping\n"); @@ -686,12 +688,18 @@ static void timehist_print_event(struct perf_kwork *kwork, * callchain */ if (kwork->show_callchain) { + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + if (cursor == NULL) + return; + printf(" "); + sample__fprintf_sym(sample, al, 0, EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE | EVSEL__PRINT_CALLCHAIN_ARROW | EVSEL__PRINT_SKIP_IGNORED, - &callchain_cursor, symbol_conf.bt_stop_list, + cursor, symbol_conf.bt_stop_list, stdout); } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fc8356bd6e3a..8b505e1e5002 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -911,7 +911,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl char *buf, int size) { struct thread *thread; - struct callchain_cursor *cursor = &callchain_cursor; + struct callchain_cursor *cursor; struct machine *machine = &session->machines.host; struct symbol *sym; int skip = 0; @@ -925,6 +925,8 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl if (thread == NULL) return -1; + cursor = get_tls_callchain_cursor(); + /* use caller function name from the callchain */ ret = thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, max_stack_depth); @@ -962,7 +964,7 @@ next: static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample) { - struct callchain_cursor *cursor = &callchain_cursor; + struct callchain_cursor *cursor; struct machine *machine = &session->machines.host; struct thread *thread; u64 hash = 0; @@ -973,6 +975,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample) if (thread == NULL) return -1; + cursor = get_tls_callchain_cursor(); /* use caller function name from the callchain */ ret = thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, max_stack_depth); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cd79068200e5..c9ddf73689cd 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2111,7 +2111,7 @@ static void timehist_print_sample(struct perf_sched *sched, EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE | EVSEL__PRINT_CALLCHAIN_ARROW | EVSEL__PRINT_SKIP_IGNORED, - &callchain_cursor, symbol_conf.bt_stop_list, stdout); + get_tls_callchain_cursor(), symbol_conf.bt_stop_list, stdout); out: printf("\n"); @@ -2196,7 +2196,7 @@ static void save_task_callchain(struct perf_sched *sched, struct evsel *evsel, struct machine *machine) { - struct callchain_cursor *cursor = &callchain_cursor; + struct callchain_cursor *cursor; struct thread *thread; /* want main thread for process - has maps */ @@ -2209,6 +2209,8 @@ static void save_task_callchain(struct perf_sched *sched, if (!sched->show_callchain || sample->callchain == NULL) return; + cursor = get_tls_callchain_cursor(); + if (thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, sched->max_stack + 2) != 0) { if (verbose > 0) @@ -2338,10 +2340,16 @@ static void save_idle_callchain(struct perf_sched *sched, struct idle_thread_runtime *itr, struct perf_sample *sample) { + struct callchain_cursor *cursor; + if (!sched->show_callchain || sample->callchain == NULL) return; - callchain_cursor__copy(&itr->cursor, &callchain_cursor); + cursor = get_tls_callchain_cursor(); + if (cursor == NULL) + return; + + callchain_cursor__copy(&itr->cursor, cursor); } static struct thread *timehist_get_thread(struct perf_sched *sched, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 784d478c2e05..e3f435e6a7d0 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1557,11 +1557,13 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, unsigned int print_opts = output[type].print_ip_opts; struct callchain_cursor *cursor = NULL; - if (symbol_conf.use_callchain && sample->callchain && - thread__resolve_callchain(al->thread, &callchain_cursor, evsel, - sample, NULL, NULL, scripting_max_stack) == 0) - cursor = &callchain_cursor; - + if (symbol_conf.use_callchain && sample->callchain) { + cursor = get_tls_callchain_cursor(); + if (thread__resolve_callchain(al->thread, cursor, evsel, + sample, NULL, NULL, + scripting_max_stack)) + cursor = NULL; + } if (cursor == NULL) { printed += fprintf(fp, " "); if (print_opts & EVSEL__PRINT_SRCLINE) { @@ -2203,11 +2205,13 @@ static void process_event(struct perf_script *script, if (script->stitch_lbr) thread__set_lbr_stitch_enable(al->thread, true); - if (symbol_conf.use_callchain && sample->callchain && - thread__resolve_callchain(al->thread, &callchain_cursor, evsel, - sample, NULL, NULL, scripting_max_stack) == 0) - cursor = &callchain_cursor; - + if (symbol_conf.use_callchain && sample->callchain) { + cursor = get_tls_callchain_cursor(); + if (thread__resolve_callchain(al->thread, cursor, evsel, + sample, NULL, NULL, + scripting_max_stack)) + cursor = NULL; + } fputc(cursor ? '\n' : ' ', fp); sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, symbol_conf.bt_stop_list, fp); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 6a1e75f06832..6e73d0e95715 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2437,7 +2437,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam EVSEL__PRINT_DSO | EVSEL__PRINT_UNKNOWN_AS_ADDR; - return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output); + return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output); } static const char *errno_to_name(struct evsel *evsel, int err) @@ -2491,9 +2491,11 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, goto out; if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2795,9 +2797,11 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2899,9 +2903,11 @@ static int trace__pgfault(struct trace *trace, thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out_put; callchain_ret = 1; } diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 909f62b3b266..aee937d14fbb 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -58,7 +58,8 @@ struct callchain_param callchain_param_default = { CALLCHAIN_PARAM_DEFAULT }; -__thread struct callchain_cursor callchain_cursor; +/* Used for thread-local struct callchain_cursor. */ +static pthread_key_t callchain_cursor; int parse_callchain_record_opt(const char *arg, struct callchain_param *param) { @@ -986,6 +987,9 @@ int callchain_append(struct callchain_root *root, struct callchain_cursor *cursor, u64 period) { + if (cursor == NULL) + return -1; + if (!cursor->nr) return 0; @@ -1116,7 +1120,7 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp if ((!symbol_conf.use_callchain || sample->callchain == NULL) && !symbol_conf.show_branchflag_count) return 0; - return callchain_append(he->callchain, &callchain_cursor, sample->period); + return callchain_append(he->callchain, get_tls_callchain_cursor(), sample->period); } int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node, @@ -1570,6 +1574,43 @@ out: return -ENOMEM; } +static void callchain_cursor__delete(void *vcursor) +{ + struct callchain_cursor *cursor = vcursor; + struct callchain_cursor_node *node, *next; + + callchain_cursor_reset(cursor); + for (node = cursor->first; node != NULL; node = next) { + next = node->next; + free(node); + } + free(cursor); +} + +static void init_callchain_cursor_key(void) +{ + if (pthread_key_create(&callchain_cursor, callchain_cursor__delete)) { + pr_err("callchain cursor creation failed"); + abort(); + } +} + +struct callchain_cursor *get_tls_callchain_cursor(void) +{ + static pthread_once_t once_control = PTHREAD_ONCE_INIT; + struct callchain_cursor *cursor; + + pthread_once(&once_control, init_callchain_cursor_key); + cursor = pthread_getspecific(callchain_cursor); + if (!cursor) { + cursor = zalloc(sizeof(*cursor)); + if (!cursor) + pr_debug3("%s: not enough memory\n", __func__); + pthread_setspecific(callchain_cursor, cursor); + } + return cursor; +} + int callchain_cursor__copy(struct callchain_cursor *dst, struct callchain_cursor *src) { diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d95615daed73..d2618a47deca 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -168,8 +168,6 @@ struct callchain_cursor { struct callchain_cursor_node *curr; }; -extern __thread struct callchain_cursor callchain_cursor; - static inline void callchain_init(struct callchain_root *root) { INIT_LIST_HEAD(&root->node.val); @@ -211,6 +209,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, /* Close a cursor writing session. Initialize for the reader */ static inline void callchain_cursor_commit(struct callchain_cursor *cursor) { + if (cursor == NULL) + return; cursor->curr = cursor->first; cursor->pos = 0; } @@ -219,7 +219,7 @@ static inline void callchain_cursor_commit(struct callchain_cursor *cursor) static inline struct callchain_cursor_node * callchain_cursor_current(struct callchain_cursor *cursor) { - if (cursor->pos == cursor->nr) + if (cursor == NULL || cursor->pos == cursor->nr) return NULL; return cursor->curr; @@ -231,6 +231,8 @@ static inline void callchain_cursor_advance(struct callchain_cursor *cursor) cursor->pos++; } +struct callchain_cursor *get_tls_callchain_cursor(void); + int callchain_cursor__copy(struct callchain_cursor *dst, struct callchain_cursor *src); diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 6184696dc266..b9fb71ab7a73 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -215,6 +215,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, u64 kernel_start = machine__kernel_start(machine); struct call_path *current = &dbe->cpr->call_path; enum chain_order saved_order = callchain_param.order; + struct callchain_cursor *cursor; int err; if (!symbol_conf.use_callchain || !sample->callchain) @@ -226,13 +227,14 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, * the callchain starting with the root node and ending with the leaf. */ callchain_param.order = ORDER_CALLER; - err = thread__resolve_callchain(thread, &callchain_cursor, evsel, + cursor = get_tls_callchain_cursor(); + err = thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, PERF_MAX_STACK_DEPTH); if (err) { callchain_param.order = saved_order; return NULL; } - callchain_cursor_commit(&callchain_cursor); + callchain_cursor_commit(cursor); while (1) { struct callchain_cursor_node *node; @@ -240,7 +242,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, u64 dso_db_id = 0, sym_db_id = 0, offset = 0; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(cursor); if (!node) break; @@ -265,7 +267,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, al.sym, node->ip, kernel_start); - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(cursor); addr_location__exit(&al); } diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index cf45ca0e768f..8719b3cb5646 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -127,6 +127,9 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, char s = print_oneline ? ' ' : '\t'; bool first = true; + if (cursor == NULL) + return fprintf(fp, "%s", print_oneline ? "" : "\n"); + if (sample->callchain) { callchain_cursor_commit(cursor); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index fb218b3e8a7c..efaf7ac784fc 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1029,15 +1029,19 @@ iter_prepare_cumulative_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { struct hist_entry **he_cache; + struct callchain_cursor *cursor = get_tls_callchain_cursor(); - callchain_cursor_commit(&callchain_cursor); + if (cursor == NULL) + return -ENOMEM; + + callchain_cursor_commit(cursor); /* * This is for detecting cycles or recursions so that they're * cumulated only one time to prevent entries more than 100% * overhead. */ - he_cache = malloc(sizeof(*he_cache) * (callchain_cursor.nr + 1)); + he_cache = malloc(sizeof(*he_cache) * (cursor->nr + 1)); if (he_cache == NULL) return -ENOMEM; @@ -1072,7 +1076,7 @@ iter_add_single_cumulative_entry(struct hist_entry_iter *iter, * We need to re-initialize the cursor since callchain_append() * advanced the cursor to the end. */ - callchain_cursor_commit(&callchain_cursor); + callchain_cursor_commit(get_tls_callchain_cursor()); hists__inc_nr_samples(hists, he->filtered); @@ -1085,7 +1089,7 @@ iter_next_cumulative_entry(struct hist_entry_iter *iter, { struct callchain_cursor_node *node; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(get_tls_callchain_cursor()); if (node == NULL) return 0; @@ -1131,12 +1135,15 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, .raw_size = sample->raw_size, }; int i; - struct callchain_cursor cursor; + struct callchain_cursor cursor, *tls_cursor = get_tls_callchain_cursor(); bool fast = hists__has(he_tmp.hists, sym); - callchain_cursor_snapshot(&cursor, &callchain_cursor); + if (tls_cursor == NULL) + return -ENOMEM; + + callchain_cursor_snapshot(&cursor, tls_cursor); - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(tls_cursor); /* * Check if there's duplicate entries in the callchain. @@ -1222,7 +1229,7 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, if (al) alm = map__get(al->map); - err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, + err = sample__resolve_callchain(iter->sample, get_tls_callchain_cursor(), &iter->parent, iter->evsel, al, max_stack_depth); if (err) { map__put(alm); @@ -1568,8 +1575,13 @@ static int hists__hierarchy_insert_entry(struct hists *hists, if (hist_entry__has_callchains(new_he) && symbol_conf.use_callchain) { - callchain_cursor_reset(&callchain_cursor); - if (callchain_merge(&callchain_cursor, + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + if (cursor == NULL) + return -1; + + callchain_cursor_reset(cursor); + if (callchain_merge(cursor, new_he->callchain, he->callchain) < 0) ret = -1; @@ -1610,11 +1622,15 @@ static int hists__collapse_insert_entry(struct hists *hists, he_stat__add_stat(iter->stat_acc, he->stat_acc); if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) { - callchain_cursor_reset(&callchain_cursor); - if (callchain_merge(&callchain_cursor, - iter->callchain, - he->callchain) < 0) - ret = -1; + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + if (cursor != NULL) { + callchain_cursor_reset(cursor); + if (callchain_merge(cursor, iter->callchain, he->callchain) < 0) + ret = -1; + } else { + ret = 0; + } } hist_entry__delete(he); return ret; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index bdad4b8bf77d..4e62843d51b7 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -3180,6 +3180,9 @@ int thread__resolve_callchain(struct thread *thread, { int ret = 0; + if (cursor == NULL) + return -ENOMEM; + callchain_cursor_reset(cursor); if (callchain_param.order == ORDER_CALLEE) { diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 65b761d83a1f..603091317bed 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -260,6 +260,7 @@ static SV *perl_process_callchain(struct perf_sample *sample, struct evsel *evsel, struct addr_location *al) { + struct callchain_cursor *cursor; AV *list; list = newAV(); @@ -269,18 +270,20 @@ static SV *perl_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + cursor = get_tls_callchain_cursor(); + + if (thread__resolve_callchain(al->thread, cursor, evsel, sample, NULL, NULL, scripting_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); goto exit; } - callchain_cursor_commit(&callchain_cursor); + callchain_cursor_commit(cursor); while (1) { HV *elem; struct callchain_cursor_node *node; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(cursor); if (!node) break; @@ -328,7 +331,7 @@ static SV *perl_process_callchain(struct perf_sample *sample, } } - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(cursor); av_push(list, newRV_noinc((SV*)elem)); } diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index d96e5c0fef45..59063ec98619 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -417,6 +417,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample, struct addr_location *al) { PyObject *pylist; + struct callchain_cursor *cursor; pylist = PyList_New(0); if (!pylist) @@ -425,19 +426,20 @@ static PyObject *python_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + cursor = get_tls_callchain_cursor(); + if (thread__resolve_callchain(al->thread, cursor, evsel, sample, NULL, NULL, scripting_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); goto exit; } - callchain_cursor_commit(&callchain_cursor); + callchain_cursor_commit(cursor); while (1) { PyObject *pyelem; struct callchain_cursor_node *node; - node = callchain_cursor_current(&callchain_cursor); + node = callchain_cursor_current(cursor); if (!node) break; @@ -493,7 +495,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample, _PyUnicode_FromString(dsoname)); } - callchain_cursor_advance(&callchain_cursor); + callchain_cursor_advance(cursor); PyList_Append(pylist, pyelem); Py_DECREF(pyelem); } -- cgit v1.2.3 From 625db36e6c53b39c664b7fcb509207d26ac58ea6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:22 -0700 Subject: perf srcline: Change free_srcline to zfree_srcline Make use after free more unlikely. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-26-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 4 ++-- tools/perf/util/annotate.c | 2 +- tools/perf/util/block-info.c | 4 ++-- tools/perf/util/hist.c | 6 +++--- tools/perf/util/map.c | 2 +- tools/perf/util/srcline.c | 15 ++++++++++----- tools/perf/util/srcline.h | 2 +- 7 files changed, 20 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index ca39657ee407..eec89567ae48 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1387,8 +1387,8 @@ static int cycles_printf(struct hist_entry *he, struct hist_entry *pair, bi->start, bi->end, block_he->diff.cycles); } - free_srcline(start_line); - free_srcline(end_line); + zfree_srcline(&start_line); + zfree_srcline(&end_line); return scnprintf(hpp->buf, hpp->size, "%*s", width, buf); } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 57ef616cdbfd..bde890cfa620 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1218,7 +1218,7 @@ static void annotation_line__init(struct annotation_line *al, static void annotation_line__exit(struct annotation_line *al) { - free_srcline(al->path); + zfree_srcline(&al->path); zfree(&al->line); } diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 16a7b4adcf18..08279b1b65e5 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -305,8 +305,8 @@ static int block_range_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, bi->start, bi->end); } - free_srcline(start_line); - free_srcline(end_line); + zfree_srcline(&start_line); + zfree_srcline(&end_line); return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index efaf7ac784fc..be2c134d672f 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1323,8 +1323,8 @@ void hist_entry__delete(struct hist_entry *he) if (he->branch_info) { map__zput(he->branch_info->from.ms.map); map__zput(he->branch_info->to.ms.map); - free_srcline(he->branch_info->srcline_from); - free_srcline(he->branch_info->srcline_to); + zfree_srcline(&he->branch_info->srcline_from); + zfree_srcline(&he->branch_info->srcline_to); zfree(&he->branch_info); } @@ -1342,7 +1342,7 @@ void hist_entry__delete(struct hist_entry *he) zfree(&he->res_samples); zfree(&he->stat_acc); - free_srcline(he->srcline); + zfree_srcline(&he->srcline); if (he->srcfile && he->srcfile[0]) zfree(&he->srcfile); free_callchain(he->callchain); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index ae1d54d4880a..c77e2fce6a37 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -498,7 +498,7 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix, char *srcline = map__srcline(map, addr, NULL); if (strncmp(srcline, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) ret = fprintf(fp, "%s%s", prefix, srcline); - free_srcline(srcline); + zfree_srcline(&srcline); } return ret; } diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index cfca03abd6f8..b8e596528d7e 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -804,10 +804,15 @@ out: return NULL; } -void free_srcline(char *srcline) +void zfree_srcline(char **srcline) { - if (srcline && strcmp(srcline, SRCLINE_UNKNOWN) != 0) - free(srcline); + if (*srcline == NULL) + return; + + if (strcmp(*srcline, SRCLINE_UNKNOWN)) + free(*srcline); + + *srcline = NULL; } char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, @@ -880,7 +885,7 @@ void srcline__tree_delete(struct rb_root_cached *tree) pos = rb_entry(next, struct srcline_node, rb_node); next = rb_next(&pos->rb_node); rb_erase_cached(&pos->rb_node, tree); - free_srcline(pos->srcline); + zfree_srcline(&pos->srcline); zfree(&pos); } } @@ -903,7 +908,7 @@ void inline_node__delete(struct inline_node *node) list_for_each_entry_safe(ilist, tmp, &node->val, list) { list_del_init(&ilist->list); - free_srcline(ilist->srcline); + zfree_srcline(&ilist->srcline); /* only the inlined symbols are owned by the list */ if (ilist->symbol && ilist->symbol->inlined) symbol__delete(ilist->symbol); diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index b11a0aaaa676..a15c7db9058e 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -15,7 +15,7 @@ char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym, bool show_addr, bool unwind_inlines, u64 ip); -void free_srcline(char *srcline); +void zfree_srcline(char **srcline); char *get_srcline_split(struct dso *dso, u64 addr, unsigned *line); /* insert the srcline into the DSO, which will take ownership */ -- cgit v1.2.3 From 834631ee770aebd05fd25eaa5a4a2d0dcd65f3c5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:23 -0700 Subject: perf hist: Fix srcline memory leak srcline isn't freed if it is SRCLINE_UNKNOWN. Avoid strduping in this case as such strdups are redundant and leak memory. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-27-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index be2c134d672f..0a10bcc6ec95 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -484,7 +484,7 @@ static int hist_entry__init(struct hist_entry *he, goto err_infos; } - if (he->srcline) { + if (he->srcline && strcmp(he->srcline, SRCLINE_UNKNOWN)) { he->srcline = strdup(he->srcline); if (he->srcline == NULL) goto err_rawdata; -- cgit v1.2.3 From 922db21d7e094c363313f9787acdd47d774651af Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Jun 2023 11:10:46 -0300 Subject: perf srcline: Optimize comparision against SRCLINE_UNKNOWN This is a string constant that gets returned and then strcmp() around, we can instead just do a pointer comparision. That requires a new global variable to comply with these warnings from some versions of clang and gcc: 41 68.95 fedora:rawhide : FAIL clang version 16.0.4 (Fedora 16.0.4-1.fc39) result of comparison against a string literal is unspecified (use an explicit string comparison function instead) [-Werror,-Wstring-compare] if (start_line != SRCLINE_UNKNOWN && ^ ~~~~~~~~~~~~~~~ 41 Ack comments: Agreed, the strcmps make me nervous as they won't distinguish heap from a global meaning we could end up with things like pointers to freed memory. The comparison with the global is always going to be same imo. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Link: https://lore.kernel.org/lkml/ZIcoJytUEz4UgQYR@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 4 ++-- tools/perf/util/block-info.c | 4 ++-- tools/perf/util/hist.c | 2 +- tools/perf/util/map.c | 2 +- tools/perf/util/sort.c | 2 +- tools/perf/util/srcline.c | 4 +++- tools/perf/util/srcline.h | 3 ++- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index eec89567ae48..e8a1b16aa5f8 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1378,8 +1378,8 @@ static int cycles_printf(struct hist_entry *he, struct hist_entry *pair, end_line = map__srcline(he->ms.map, bi->sym->start + bi->end, he->ms.sym); - if ((strncmp(start_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) && - (strncmp(end_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0)) { + if (start_line != SRCLINE_UNKNOWN && + end_line != SRCLINE_UNKNOWN) { scnprintf(buf, sizeof(buf), "[%s -> %s] %4ld", start_line, end_line, block_he->diff.cycles); } else { diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 08279b1b65e5..591fc1edd385 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -296,8 +296,8 @@ static int block_range_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, end_line = map__srcline(he->ms.map, bi->sym->start + bi->end, he->ms.sym); - if ((strncmp(start_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) && - (strncmp(end_line, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0)) { + if (start_line != SRCLINE_UNKNOWN && + end_line != SRCLINE_UNKNOWN) { scnprintf(buf, sizeof(buf), "[%s -> %s]", start_line, end_line); } else { diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0a10bcc6ec95..3dc8a4968beb 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -484,7 +484,7 @@ static int hist_entry__init(struct hist_entry *he, goto err_infos; } - if (he->srcline && strcmp(he->srcline, SRCLINE_UNKNOWN)) { + if (he->srcline && he->srcline != SRCLINE_UNKNOWN) { he->srcline = strdup(he->srcline); if (he->srcline == NULL) goto err_rawdata; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index c77e2fce6a37..f30d34903aa4 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -496,7 +496,7 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix, if (dso) { char *srcline = map__srcline(map, addr, NULL); - if (strncmp(srcline, SRCLINE_UNKNOWN, strlen(SRCLINE_UNKNOWN)) != 0) + if (srcline != SRCLINE_UNKNOWN) ret = fprintf(fp, "%s%s", prefix, srcline); zfree_srcline(&srcline); } diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 047c3606802f..6aa1c7f2b444 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -643,7 +643,7 @@ static char *hist_entry__get_srcfile(struct hist_entry *e) sf = __get_srcline(map__dso(map), map__rip_2objdump(map, e->ip), e->ms.sym, false, true, true, e->ip); - if (!strcmp(sf, SRCLINE_UNKNOWN)) + if (sf == SRCLINE_UNKNOWN) return no_srcfile; p = strchr(sf, ':'); if (p && *sf) { diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index b8e596528d7e..aec596a0b0bb 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -23,6 +23,8 @@ bool srcline_full_filename; +char *srcline__unknown = (char *)"??:0"; + static const char *dso__name(struct dso *dso) { const char *dso_name; @@ -809,7 +811,7 @@ void zfree_srcline(char **srcline) if (*srcline == NULL) return; - if (strcmp(*srcline, SRCLINE_UNKNOWN)) + if (*srcline != SRCLINE_UNKNOWN) free(*srcline); *srcline = NULL; diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index a15c7db9058e..167645bcff07 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -25,7 +25,8 @@ char *srcline__tree_find(struct rb_root_cached *tree, u64 addr); /* delete all srclines within the tree */ void srcline__tree_delete(struct rb_root_cached *tree); -#define SRCLINE_UNKNOWN ((char *) "??:0") +extern char *srcline__unknown; +#define SRCLINE_UNKNOWN srcline__unknown struct inline_list { struct symbol *symbol; -- cgit v1.2.3 From 0d98a7af4b12ae7ea78075240a66c21e5d3d9325 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 16:04:24 +0100 Subject: perf map: Fix double 'struct map' reference free found with -DREFCNT_CHECKING=1 When quitting after running a 'perf report', the refcount checker finds some double frees. The issue is that map__put() is called on a function argument so it removes the refcount wrapper that someone else was using. Fix it by only calling map__put() on a reference that is owned by this function. Committer notes: Narrowed the map_ref scope as suggested by Ian, removed the symbol-elf part as it was already fixed by another patch, from Ian. Signed-off-by: James Clark Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230612150424.198914-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 6b9c55784b56..d275d3bef7d5 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1458,16 +1458,18 @@ static int dso__load_kcore(struct dso *dso, struct map *map, list_del_init(&new_node->node); if (RC_CHK_ACCESS(new_map) == RC_CHK_ACCESS(replacement_map)) { + struct map *map_ref; + map__set_start(map, map__start(new_map)); map__set_end(map, map__end(new_map)); map__set_pgoff(map, map__pgoff(new_map)); map__set_map_ip(map, map__map_ip_ptr(new_map)); map__set_unmap_ip(map, map__unmap_ip_ptr(new_map)); /* Ensure maps are correctly ordered */ - map__get(map); - maps__remove(kmaps, map); - err = maps__insert(kmaps, map); - map__put(map); + map_ref = map__get(map); + maps__remove(kmaps, map_ref); + err = maps__insert(kmaps, map_ref); + map__put(map_ref); map__put(new_map); if (err) goto out_err; -- cgit v1.2.3 From 951ccccdc7153120673fdc398878d629dcb7adf6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 12:13:58 +0100 Subject: perf cs-etm: Only track threads instead of PID and TIDs PIDs and TIDs are already contained within the thread struct, so to avoid inconsistencies drop the extra members on the etm queue and only use the thread struct. At the same time stop using the 'unknown' thread. In a later commit we will be making samples from multiple machines so it will be better to use the idle thread of each machine rather than overlapping unknown threads. Using the idle thread is also better because kernel addresses with a previously unknown thread will now be assigned to a real kernel thread. Committer notes: Resolved conflicts with: perf addr_location: Add init/exit/copy functions perf thread: Add accessor functions for thread perf thread: Remove notion of dead threads That were present in tmp.perf-tools.next only. Reviewed-by: Leo Yan Reviewed-by: Mike Leach Signed-off-by: James Clark Acked-by: Suzuki Poulouse Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20230612111403.100613-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 118 +++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 80 deletions(-) diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 416f2ddc3895..83881c80ea06 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -46,8 +46,6 @@ struct cs_etm_auxtrace { struct auxtrace_heap heap; struct itrace_synth_opts synth_opts; struct perf_session *session; - struct machine *machine; - struct thread *unknown_thread; struct perf_tsc_conversion tc; /* @@ -84,7 +82,6 @@ struct cs_etm_auxtrace { struct cs_etm_traceid_queue { u8 trace_chan_id; - pid_t pid, tid; u64 period_instructions; size_t last_branch_pos; union perf_event *event_buf; @@ -480,9 +477,9 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq, cs_etm__clear_packet_queue(&tidq->packet_queue); queue = &etmq->etm->queues.queue_array[etmq->queue_nr]; - tidq->tid = queue->tid; - tidq->pid = -1; tidq->trace_chan_id = trace_chan_id; + tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1, + queue->tid); tidq->packet = zalloc(sizeof(struct cs_etm_packet)); if (!tidq->packet) @@ -863,7 +860,6 @@ static void cs_etm__free(struct perf_session *session) for (i = 0; i < aux->num_cpu; i++) zfree(&aux->metadata[i]); - thread__zput(aux->unknown_thread); zfree(&aux->metadata); zfree(&aux); } @@ -882,7 +878,7 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) { struct machine *machine; - machine = etmq->etm->machine; + machine = &etmq->etm->session->machines.host; if (address >= machine__kernel_start(machine)) { if (machine__is_host(machine)) @@ -905,8 +901,6 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, u8 cpumode; u64 offset; int len; - struct thread *thread; - struct machine *machine; struct addr_location al; struct dso *dso; struct cs_etm_traceid_queue *tidq; @@ -916,20 +910,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, return 0; addr_location__init(&al); - machine = etmq->etm->machine; cpumode = cs_etm__cpu_mode(etmq, address); tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id); if (!tidq) goto out; - thread = tidq->thread; - if (!thread) { - if (cpumode != PERF_RECORD_MISC_KERNEL) - goto out; - thread = etmq->etm->unknown_thread; - } - - if (!thread__find_map(thread, cpumode, address, &al)) + if (!thread__find_map(tidq->thread, cpumode, address, &al)) goto out; dso = map__dso(al.map); @@ -944,7 +930,8 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, map__load(al.map); - len = dso__data_read_offset(dso, machine, offset, buffer, size); + len = dso__data_read_offset(dso, maps__machine(thread__maps(tidq->thread)), + offset, buffer, size); if (len <= 0) { ui__warning_once("CS ETM Trace: Missing DSO. Use 'perf archive' or debuginfod to export data from the traced system.\n" @@ -1307,39 +1294,31 @@ cs_etm__get_trace(struct cs_etm_queue *etmq) return etmq->buf_len; } -static void cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm, - struct cs_etm_traceid_queue *tidq) +static void cs_etm__set_thread(struct cs_etm_auxtrace *etm, + struct cs_etm_traceid_queue *tidq, pid_t tid) { - if ((!tidq->thread) && (tidq->tid != -1)) - tidq->thread = machine__find_thread(etm->machine, -1, - tidq->tid); + struct machine *machine = &etm->session->machines.host; + + if (tid != -1) { + thread__zput(tidq->thread); + tidq->thread = machine__find_thread(machine, -1, tid); + } - if (tidq->thread) - tidq->pid = thread__pid(tidq->thread); + /* Couldn't find a known thread */ + if (!tidq->thread) + tidq->thread = machine__idle_thread(machine); } int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id) { - int cpu, err = -EINVAL; - struct cs_etm_auxtrace *etm = etmq->etm; struct cs_etm_traceid_queue *tidq; tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id); if (!tidq) - return err; - - if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0) - return err; - - err = machine__set_current_tid(etm->machine, cpu, tid, tid); - if (err) - return err; - - tidq->tid = tid; - thread__zput(tidq->thread); + return -EINVAL; - cs_etm__set_pid_tid_cpu(etm, tidq); + cs_etm__set_thread(etmq->etm, tidq, tid); return 0; } @@ -1416,8 +1395,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, sample.time = cs_etm__resolve_sample_time(etmq, tidq); sample.ip = addr; - sample.pid = tidq->pid; - sample.tid = tidq->tid; + sample.pid = thread__pid(tidq->thread); + sample.tid = thread__tid(tidq->thread); sample.id = etmq->etm->instructions_id; sample.stream_id = etmq->etm->instructions_id; sample.period = period; @@ -1475,8 +1454,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, sample.time = cs_etm__resolve_sample_time(etmq, tidq); sample.ip = ip; - sample.pid = tidq->pid; - sample.tid = tidq->tid; + sample.pid = thread__pid(tidq->thread); + sample.tid = thread__tid(tidq->thread); sample.addr = cs_etm__first_executed_instr(tidq->packet); sample.id = etmq->etm->branches_id; sample.stream_id = etmq->etm->branches_id; @@ -2470,11 +2449,6 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, if (!etmq) continue; - /* - * Per-cpu mode has contextIDs in the trace and the decoder - * calls cs_etm__set_pid_tid_cpu() automatically so no need - * to do this here - */ if (etm->per_thread_decoding) { tidq = cs_etm__etmq_get_traceid_queue( etmq, CS_ETM_PER_THREAD_TRACEID); @@ -2482,10 +2456,8 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, if (!tidq) continue; - if ((tid == -1) || (tidq->tid == tid)) { - cs_etm__set_pid_tid_cpu(etm, tidq); + if (tid == -1 || thread__tid(tidq->thread) == tid) cs_etm__run_per_thread_timeless_decoder(etmq); - } } else cs_etm__run_per_cpu_timeless_decoder(etmq); } @@ -2615,10 +2587,12 @@ static int cs_etm__process_itrace_start(struct cs_etm_auxtrace *etm, return 0; /* - * Add the tid/pid to the log so that we can get a match when - * we get a contextID from the decoder. + * Add the tid/pid to the log so that we can get a match when we get a + * contextID from the decoder. Only track for the host: only kernel + * trace is supported for guests which wouldn't need pids so this should + * be fine. */ - th = machine__findnew_thread(etm->machine, + th = machine__findnew_thread(&etm->session->machines.host, event->itrace_start.pid, event->itrace_start.tid); if (!th) @@ -2651,10 +2625,12 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm, return 0; /* - * Add the tid/pid to the log so that we can get a match when - * we get a contextID from the decoder. + * Add the tid/pid to the log so that we can get a match when we get a + * contextID from the decoder. Only track for the host: only kernel + * trace is supported for guests which wouldn't need pids so this should + * be fine. */ - th = machine__findnew_thread(etm->machine, + th = machine__findnew_thread(&etm->session->machines.host, event->context_switch.next_prev_pid, event->context_switch.next_prev_tid); if (!th) @@ -3263,7 +3239,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, } etm->session = session; - etm->machine = &session->machines.host; etm->num_cpu = num_cpu; etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff); @@ -3290,21 +3265,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) return err; - etm->unknown_thread = thread__new(999999999, 999999999); - if (!etm->unknown_thread) { - err = -ENOMEM; - goto err_free_queues; - } - - err = thread__set_comm(etm->unknown_thread, "unknown", 0); - if (err) - goto err_delete_thread; - - if (thread__init_maps(etm->unknown_thread, etm->machine)) { - err = -ENOMEM; - goto err_delete_thread; - } - etm->tc.time_shift = tc->time_shift; etm->tc.time_mult = tc->time_mult; etm->tc.time_zero = tc->time_zero; @@ -3316,7 +3276,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, } err = cs_etm__synth_events(etm, session); if (err) - goto err_delete_thread; + goto err_free_queues; /* * Map Trace ID values to CPU metadata. @@ -3346,7 +3306,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, session->header.data_size, cs_etm__process_aux_hw_id_cb, &aux_hw_id_found); if (err) - goto err_delete_thread; + goto err_free_queues; /* if HW ID found then clear any unused metadata ID values */ if (aux_hw_id_found) @@ -3356,17 +3316,15 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, err = cs_etm__map_trace_ids_metadata(num_cpu, metadata); if (err) - goto err_delete_thread; + goto err_free_queues; err = cs_etm__queue_aux_records(session); if (err) - goto err_delete_thread; + goto err_free_queues; etm->data_queued = etm->queues.populated; return 0; -err_delete_thread: - thread__zput(etm->unknown_thread); err_free_queues: auxtrace_queues__free(&etm->queues); session->auxtrace = NULL; -- cgit v1.2.3 From d67d8c87d0e3c808e6c716ab59f981f7d0ec2cbd Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 12:13:59 +0100 Subject: perf cs-etm: Use previous thread for branch sample source IP Branch samples currently use the IP of the previous packet as the from IP, and the IP of the current packet as the to IP. But it incorrectly uses the current thread. In some cases like a jump into a different exception level this will attribute to the incorrect process. Fix it by tracking the previous thread in the same way the previous packet is tracked. Committer notes: Resolved conflicts with: perf addr_location: Add init/exit/copy functions perf thread: Add accessor functions for thread Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20230612111403.100613-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 83881c80ea06..da22732e50f6 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -86,6 +86,7 @@ struct cs_etm_traceid_queue { size_t last_branch_pos; union perf_event *event_buf; struct thread *thread; + struct thread *prev_packet_thread; struct branch_stack *last_branch; struct branch_stack *last_branch_rb; struct cs_etm_packet *prev_packet; @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq, tidq->trace_chan_id = trace_chan_id; tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1, queue->tid); + tidq->prev_packet_thread = machine__idle_thread(&etm->session->machines.host); tidq->packet = zalloc(sizeof(struct cs_etm_packet)); if (!tidq->packet) @@ -612,10 +614,20 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, /* * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for * the next incoming packet. + * + * Threads and exception levels are also tracked for both the + * previous and current packets. This is because the previous + * packet is used for the 'from' IP for branch samples, so the + * thread at that time must also be assigned to that sample. + * Across discontinuity packets the thread can change, so by + * tracking the thread for the previous packet the branch sample + * will have the correct info. */ tmp = tidq->packet; tidq->packet = tidq->prev_packet; tidq->prev_packet = tmp; + thread__put(tidq->prev_packet_thread); + tidq->prev_packet_thread = thread__get(tidq->thread); } } @@ -791,6 +803,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq) /* Free this traceid_queue from the array */ tidq = etmq->traceid_queues[idx]; thread__zput(tidq->thread); + thread__zput(tidq->prev_packet_thread); zfree(&tidq->event_buf); zfree(&tidq->last_branch); zfree(&tidq->last_branch_rb); @@ -1454,8 +1467,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, sample.time = cs_etm__resolve_sample_time(etmq, tidq); sample.ip = ip; - sample.pid = thread__pid(tidq->thread); - sample.tid = thread__tid(tidq->thread); + sample.pid = thread__pid(tidq->prev_packet_thread); + sample.tid = thread__tid(tidq->prev_packet_thread); sample.addr = cs_etm__first_executed_instr(tidq->packet); sample.id = etmq->etm->branches_id; sample.stream_id = etmq->etm->branches_id; -- cgit v1.2.3 From 5414b532611b19671cb10813e5d56e011574d698 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 12:14:00 +0100 Subject: perf cs-etm: Make PID format accessible from struct cs_etm_auxtrace To avoid every user of PID format having to use their own static local variable, cache it on initialisation and change the accessor to take struct cs_etm_auxtrace. Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230612111403.100613-4-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 20 +++--------- tools/perf/util/cs-etm.c | 42 +++++++++++++++---------- tools/perf/util/cs-etm.h | 8 ++++- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 82a27ab90c8b..2af641d26866 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -541,34 +541,22 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq, const uint8_t trace_chan_id) { pid_t tid = -1; - static u64 pid_fmt; - int ret; - - /* - * As all the ETMs run at the same exception level, the system should - * have the same PID format crossing CPUs. So cache the PID format - * and reuse it for sequential decoding. - */ - if (!pid_fmt) { - ret = cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt); - if (ret) - return OCSD_RESP_FATAL_SYS_ERR; - } /* * Process the PE_CONTEXT packets if we have a valid contextID or VMID. * If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2 * as VMID, Bit ETM_OPT_CTXTID2 is set in this case. */ - switch (pid_fmt) { - case BIT(ETM_OPT_CTXTID): + switch (cs_etm__get_pid_fmt(etmq)) { + case CS_ETM_PIDFMT_CTXTID: if (elem->context.ctxt_id_valid) tid = elem->context.context_id; break; - case BIT(ETM_OPT_CTXTID2): + case CS_ETM_PIDFMT_CTXTID2: if (elem->context.vmid_valid) tid = elem->context.vmid; break; + case CS_ETM_PIDFMT_NONE: default: break; } diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index da22732e50f6..8c4d55a802b0 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -78,6 +78,7 @@ struct cs_etm_auxtrace { u64 instructions_id; u64 **metadata; unsigned int pmu_type; + enum cs_etm_pid_fmt pid_fmt; }; struct cs_etm_traceid_queue { @@ -170,44 +171,46 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu) } /* - * The returned PID format is presented by two bits: + * The returned PID format is presented as an enum: * - * Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced; - * Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced. + * CS_ETM_PIDFMT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced. + * CS_ETM_PIDFMT_CTXTID2: CONTEXTIDR_EL2 is traced. + * CS_ETM_PIDFMT_NONE: No context IDs * * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2 * are enabled at the same time when the session runs on an EL2 kernel. * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be * recorded in the trace data, the tool will selectively use * CONTEXTIDR_EL2 as PID. + * + * The result is cached in etm->pid_fmt so this function only needs to be called + * when processing the aux info. */ -int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt) +static enum cs_etm_pid_fmt cs_etm__init_pid_fmt(u64 *metadata) { - struct int_node *inode; - u64 *metadata, val; - - inode = intlist__find(traceid_list, trace_chan_id); - if (!inode) - return -EINVAL; - - metadata = inode->priv; + u64 val; if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) { val = metadata[CS_ETM_ETMCR]; /* CONTEXTIDR is traced */ if (val & BIT(ETM_OPT_CTXTID)) - *pid_fmt = BIT(ETM_OPT_CTXTID); + return CS_ETM_PIDFMT_CTXTID; } else { val = metadata[CS_ETMV4_TRCCONFIGR]; /* CONTEXTIDR_EL2 is traced */ if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT))) - *pid_fmt = BIT(ETM_OPT_CTXTID2); + return CS_ETM_PIDFMT_CTXTID2; /* CONTEXTIDR_EL1 is traced */ else if (val & BIT(ETM4_CFG_BIT_CTXTID)) - *pid_fmt = BIT(ETM_OPT_CTXTID); + return CS_ETM_PIDFMT_CTXTID; } - return 0; + return CS_ETM_PIDFMT_NONE; +} + +enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq) +{ + return etmq->etm->pid_fmt; } static int cs_etm__map_trace_id(u8 trace_chan_id, u64 *cpu_metadata) @@ -3239,6 +3242,13 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, goto err_free_metadata; } + /* + * As all the ETMs run at the same exception level, the system should + * have the same PID format crossing CPUs. So cache the PID format + * and reuse it for sequential decoding. + */ + etm->pid_fmt = cs_etm__init_pid_fmt(metadata[0]); + err = auxtrace_queues__init(&etm->queues); if (err) goto err_free_etm; diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index ecca40787ac9..2f47f4ec5b27 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -244,9 +244,15 @@ int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session); struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu); +enum cs_etm_pid_fmt { + CS_ETM_PIDFMT_NONE, + CS_ETM_PIDFMT_CTXTID, + CS_ETM_PIDFMT_CTXTID2 +}; + #ifdef HAVE_CSTRACE_SUPPORT int cs_etm__get_cpu(u8 trace_chan_id, int *cpu); -int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt); +enum pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq); int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id); bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq); -- cgit v1.2.3 From 8d3031d39fe84cce9ab74ee22309ec8c0433c4a1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 12:14:01 +0100 Subject: perf cs-etm: Track exception level Currently we assume all trace belongs to the host machine so when the decoder should be looking at the guest kernel maps it can crash because it looks at the host ones instead. Avoid one scenario (guest kernel running at EL1) by assigning the default guest machine to this trace. For userspace trace it's still not possible to determine guest vs host, but the PIDs should help in this case. Committer notes: Fixed up conflict with: perf addr_location: Add init/exit/copy functions That was only on tmp.perf-tools-next. Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230612111403.100613-5-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 ++- tools/perf/util/cs-etm.c | 76 +++++++++++++++++++------ tools/perf/util/cs-etm.h | 7 ++- 3 files changed, 68 insertions(+), 22 deletions(-) diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 2af641d26866..44c49acd6bff 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -561,12 +561,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq, break; } + if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id, + elem->context.exception_level)) + return OCSD_RESP_FATAL_SYS_ERR; + if (tid == -1) return OCSD_RESP_CONT; - if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id)) - return OCSD_RESP_FATAL_SYS_ERR; - /* * A timestamp is generated after a PE_CONTEXT element so make sure * to rely on that coming one. diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 8c4d55a802b0..211e8b200f11 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -14,7 +14,6 @@ #include #include -#include #include #include "auxtrace.h" @@ -88,6 +87,8 @@ struct cs_etm_traceid_queue { union perf_event *event_buf; struct thread *thread; struct thread *prev_packet_thread; + ocsd_ex_level prev_packet_el; + ocsd_ex_level el; struct branch_stack *last_branch; struct branch_stack *last_branch_rb; struct cs_etm_packet *prev_packet; @@ -482,6 +483,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq, queue = &etmq->etm->queues.queue_array[etmq->queue_nr]; tidq->trace_chan_id = trace_chan_id; + tidq->el = tidq->prev_packet_el = ocsd_EL_unknown; tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1, queue->tid); tidq->prev_packet_thread = machine__idle_thread(&etm->session->machines.host); @@ -629,6 +631,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, tmp = tidq->packet; tidq->packet = tidq->prev_packet; tidq->prev_packet = tmp; + tidq->prev_packet_el = tidq->el; thread__put(tidq->prev_packet_thread); tidq->prev_packet_thread = thread__get(tidq->thread); } @@ -890,11 +893,43 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session, return evsel->core.attr.type == aux->pmu_type; } -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) +static struct machine *cs_etm__get_machine(struct cs_etm_queue *etmq, + ocsd_ex_level el) { - struct machine *machine; + enum cs_etm_pid_fmt pid_fmt = cs_etm__get_pid_fmt(etmq); - machine = &etmq->etm->session->machines.host; + /* + * For any virtualisation based on nVHE (e.g. pKVM), or host kernels + * running at EL1 assume everything is the host. + */ + if (pid_fmt == CS_ETM_PIDFMT_CTXTID) + return &etmq->etm->session->machines.host; + + /* + * Not perfect, but otherwise assume anything in EL1 is the default + * guest, and everything else is the host. Distinguishing between guest + * and host userspaces isn't currently supported either. Neither is + * multiple guest support. All this does is reduce the likeliness of + * decode errors where we look into the host kernel maps when it should + * have been the guest maps. + */ + switch (el) { + case ocsd_EL1: + return machines__find_guest(&etmq->etm->session->machines, + DEFAULT_GUEST_KERNEL_ID); + case ocsd_EL3: + case ocsd_EL2: + case ocsd_EL0: + case ocsd_EL_unknown: + default: + return &etmq->etm->session->machines.host; + } +} + +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address, + ocsd_ex_level el) +{ + struct machine *machine = cs_etm__get_machine(etmq, el); if (address >= machine__kernel_start(machine)) { if (machine__is_host(machine)) @@ -904,10 +939,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address) } else { if (machine__is_host(machine)) return PERF_RECORD_MISC_USER; - else if (perf_guest) + else { + /* + * Can't really happen at the moment because + * cs_etm__get_machine() will always return + * machines.host for any non EL1 trace. + */ return PERF_RECORD_MISC_GUEST_USER; - else - return PERF_RECORD_MISC_HYPERVISOR; + } } } @@ -926,11 +965,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, return 0; addr_location__init(&al); - cpumode = cs_etm__cpu_mode(etmq, address); tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id); if (!tidq) goto out; + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el); + if (!thread__find_map(tidq->thread, cpumode, address, &al)) goto out; @@ -1310,10 +1350,11 @@ cs_etm__get_trace(struct cs_etm_queue *etmq) return etmq->buf_len; } -static void cs_etm__set_thread(struct cs_etm_auxtrace *etm, - struct cs_etm_traceid_queue *tidq, pid_t tid) +static void cs_etm__set_thread(struct cs_etm_queue *etmq, + struct cs_etm_traceid_queue *tidq, pid_t tid, + ocsd_ex_level el) { - struct machine *machine = &etm->session->machines.host; + struct machine *machine = cs_etm__get_machine(etmq, el); if (tid != -1) { thread__zput(tidq->thread); @@ -1323,10 +1364,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm, /* Couldn't find a known thread */ if (!tidq->thread) tidq->thread = machine__idle_thread(machine); + + tidq->el = el; } -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, - pid_t tid, u8 trace_chan_id) +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, + u8 trace_chan_id, ocsd_ex_level el) { struct cs_etm_traceid_queue *tidq; @@ -1334,7 +1377,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, if (!tidq) return -EINVAL; - cs_etm__set_thread(etmq->etm, tidq, tid); + cs_etm__set_thread(etmq, tidq, tid, el); return 0; } @@ -1404,7 +1447,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, struct perf_sample sample = {.ip = 0,}; event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr); + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el); event->sample.header.size = sizeof(struct perf_event_header); /* Set time field based on etm auxtrace config. */ @@ -1463,7 +1506,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, ip = cs_etm__last_executed_instr(tidq->prev_packet); event->sample.header.type = PERF_RECORD_SAMPLE; - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip); + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, + tidq->prev_packet_el); event->sample.header.size = sizeof(struct perf_event_header); /* Set time field based on etm auxtrace config. */ diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 2f47f4ec5b27..7cca37887917 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -251,10 +251,11 @@ enum cs_etm_pid_fmt { }; #ifdef HAVE_CSTRACE_SUPPORT +#include int cs_etm__get_cpu(u8 trace_chan_id, int *cpu); -enum pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq); -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, - pid_t tid, u8 trace_chan_id); +enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq); +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, + u8 trace_chan_id, ocsd_ex_level el); bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq); void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, u8 trace_chan_id); -- cgit v1.2.3 From d927ef5004ef79e7fa6e85ff1f62f19fd4051988 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 12 Jun 2023 12:14:02 +0100 Subject: perf cs-etm: Add exception level consistency check Assert that our own tracking of the exception level matches what OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the memory access callback so the extra tracking was required. But a rough assert can still be done. Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230612111403.100613-6-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 ++-- tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 ++- tools/perf/util/cs-etm.c | 41 ++++++++++++++++++------- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 44c49acd6bff..e917985bbbe6 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -52,15 +52,15 @@ struct cs_etm_decoder { static u32 cs_etm_decoder__mem_access(const void *context, const ocsd_vaddr_t address, - const ocsd_mem_space_acc_t mem_space __maybe_unused, + const ocsd_mem_space_acc_t mem_space, const u8 trace_chan_id, const u32 req_size, u8 *buffer) { struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context; - return decoder->mem_access(decoder->data, trace_chan_id, - address, req_size, buffer); + return decoder->mem_access(decoder->data, trace_chan_id, address, + req_size, buffer, mem_space); } int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder, diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index 21d403f55d96..272c2efe78ee 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -11,6 +11,7 @@ #define INCLUDE__CS_ETM_DECODER_H__ #include +#include #include struct cs_etm_decoder; @@ -19,7 +20,8 @@ struct cs_etm_packet_queue; struct cs_etm_queue; -typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *); +typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *, + const ocsd_mem_space_acc_t); struct cs_etmv3_trace_params { u32 reg_ctrl; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 211e8b200f11..1419b40dfbe8 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -951,7 +951,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address, } static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, - u64 address, size_t size, u8 *buffer) + u64 address, size_t size, u8 *buffer, + const ocsd_mem_space_acc_t mem_space) { u8 cpumode; u64 offset; @@ -969,6 +970,24 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id, if (!tidq) goto out; + /* + * We've already tracked EL along side the PID in cs_etm__set_thread() + * so double check that it matches what OpenCSD thinks as well. It + * doesn't distinguish between EL0 and EL1 for this mem access callback + * so we had to do the extra tracking. Skip validation if it's any of + * the 'any' values. + */ + if (!(mem_space == OCSD_MEM_SPACE_ANY || + mem_space == OCSD_MEM_SPACE_N || mem_space == OCSD_MEM_SPACE_S)) { + if (mem_space & OCSD_MEM_SPACE_EL1N) { + /* Includes both non secure EL1 and EL0 */ + assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0); + } else if (mem_space & OCSD_MEM_SPACE_EL2) + assert(tidq->el == ocsd_EL2); + else if (mem_space & OCSD_MEM_SPACE_EL3) + assert(tidq->el == ocsd_EL3); + } + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el); if (!thread__find_map(tidq->thread, cpumode, address, &al)) @@ -1219,8 +1238,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq, { u8 instrBytes[2]; - cs_etm__mem_access(etmq, trace_chan_id, addr, - ARRAY_SIZE(instrBytes), instrBytes); + cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes), + instrBytes, 0); /* * T32 instruction size is indicated by bits[15:11] of the first * 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111 @@ -1411,8 +1430,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq, else sample->insn_len = 4; - cs_etm__mem_access(etmq, trace_chan_id, sample->ip, - sample->insn_len, (void *)sample->insn); + cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len, + (void *)sample->insn, 0); } u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp) @@ -1965,8 +1984,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id, * so below only read 2 bytes as instruction size for T32. */ addr = end_addr - 2; - cs_etm__mem_access(etmq, trace_chan_id, addr, - sizeof(instr16), (u8 *)&instr16); + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16), + (u8 *)&instr16, 0); if ((instr16 & 0xFF00) == 0xDF00) return true; @@ -1981,8 +2000,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id, * +---------+---------+-------------------------+ */ addr = end_addr - 4; - cs_etm__mem_access(etmq, trace_chan_id, addr, - sizeof(instr32), (u8 *)&instr32); + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32), + (u8 *)&instr32, 0); if ((instr32 & 0x0F000000) == 0x0F000000 && (instr32 & 0xF0000000) != 0xF0000000) return true; @@ -1998,8 +2017,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id, * +-----------------------+---------+-----------+ */ addr = end_addr - 4; - cs_etm__mem_access(etmq, trace_chan_id, addr, - sizeof(instr32), (u8 *)&instr32); + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32), + (u8 *)&instr32, 0); if ((instr32 & 0xFFE0001F) == 0xd4000001) return true; -- cgit v1.2.3 From 657a3efee43a29d13c4f30e4c8f6a178fd2bf14a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 11 Jun 2023 16:36:06 -0700 Subject: lib subcmd: Avoid memory leak in exclude_cmds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit exclude_cmds will shorten the cmds names array, before doing so free the removed entry. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20230611233610.953456-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/help.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index bf02d62a3b2b..a66fb1a1a312 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -66,6 +66,7 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); if (cmp < 0) { + zfree(&cmds->names[cj]); cmds->names[cj++] = cmds->names[ci++]; } else if (cmp == 0) { ci++; @@ -75,9 +76,12 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) } } - while (ci < cmds->cnt) + while (ci < cmds->cnt) { + zfree(&cmds->names[cj]); cmds->names[cj++] = cmds->names[ci++]; - + } + for (ci = cj; ci < cmds->cnt; ci++) + zfree(&cmds->names[ci]); cmds->cnt = cj; } -- cgit v1.2.3 From 0f0d1354a54cf679e773cae551b4523f5ec00c94 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 11 Jun 2023 16:36:07 -0700 Subject: perf help: Ensure clean_cmds is called on all paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid potential memory leaks. Committer notes: This is right before calling exit(1), so just to clean up memory leak checker detection. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20230611233610.953456-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/help-unknown-cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index ab9e16123626..eab99ea6ac01 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -92,6 +92,7 @@ const char *help_unknown_cmd(const char *cmd) main_cmds.names[0] = NULL; clean_cmdnames(&main_cmds); + clean_cmdnames(&other_cmds); fprintf(stderr, "WARNING: You called a perf program named '%s', " "which does not exist.\n" "Continuing under the assumption that you meant '%s'\n", @@ -114,5 +115,7 @@ const char *help_unknown_cmd(const char *cmd) fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); } end: + clean_cmdnames(&main_cmds); + clean_cmdnames(&other_cmds); exit(1); } -- cgit v1.2.3 From e6deda2e5a6a387437bcaeffa7bf4bc95fe8c446 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 11 Jun 2023 16:36:08 -0700 Subject: perf bench epoll: Fix missing frees/puts on the exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issues detected by leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20230611233610.953456-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/epoll-ctl.c | 5 +++++ tools/perf/bench/epoll-wait.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 521d1ff97b06..6bfffe83dde9 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -421,6 +421,11 @@ int bench_epoll_ctl(int argc, const char **argv) print_summary(); close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); return ret; errmem: err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index c1cdf03c075d..cb5174b53940 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -549,6 +549,11 @@ int bench_epoll_wait(int argc, const char **argv) print_summary(); close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); return ret; errmem: err(EXIT_FAILURE, "calloc"); -- cgit v1.2.3 From 8351498d5204ef572ace0582c33b2302fe303c57 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 11 Jun 2023 16:36:09 -0700 Subject: perf bench futex: Avoid memory leaks from pthread_attr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove code sharing the pthread_attr_t and initialize/destroy pthread_attr_t when needed. This avoids the same attribute being set that leak sanitizer reports as a memory leak. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20230611233610.953456-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-lock-pi.c | 12 ++++++------ tools/perf/bench/futex-requeue.c | 12 ++++++------ tools/perf/bench/futex-wake-parallel.c | 19 +++++++++++-------- tools/perf/bench/futex-wake.c | 12 ++++++------ 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 2d0417949727..092cbd52db82 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -118,8 +118,7 @@ static void *workerfn(void *arg) return NULL; } -static void create_threads(struct worker *w, pthread_attr_t thread_attr, - struct perf_cpu_map *cpu) +static void create_threads(struct worker *w, struct perf_cpu_map *cpu) { cpu_set_t *cpuset; unsigned int i; @@ -133,6 +132,9 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr, size = CPU_ALLOC_SIZE(nrcpus); for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); worker[i].tid = i; if (params.multi) { @@ -154,6 +156,7 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr, CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); } + pthread_attr_destroy(&thread_attr); } CPU_FREE(cpuset); } @@ -163,7 +166,6 @@ int bench_futex_lock_pi(int argc, const char **argv) int ret = 0; unsigned int i; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0); @@ -203,11 +205,9 @@ int bench_futex_lock_pi(int argc, const char **argv) cond_init(&thread_worker); threads_starting = params.nthreads; - pthread_attr_init(&thread_attr); gettimeofday(&bench__start, NULL); - create_threads(worker, thread_attr, cpu); - pthread_attr_destroy(&thread_attr); + create_threads(worker, cpu); mutex_lock(&thread_lock); while (threads_starting) diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index 69ad896f556c..c0035990a33c 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -121,8 +121,7 @@ static void *workerfn(void *arg __maybe_unused) return NULL; } -static void block_threads(pthread_t *w, - pthread_attr_t thread_attr, struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { cpu_set_t *cpuset; unsigned int i; @@ -137,6 +136,9 @@ static void block_threads(pthread_t *w, /* create and block all threads */ for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); CPU_ZERO_S(size, cpuset); CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); @@ -149,6 +151,7 @@ static void block_threads(pthread_t *w, CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); } + pthread_attr_destroy(&thread_attr); } CPU_FREE(cpuset); } @@ -165,7 +168,6 @@ int bench_futex_requeue(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0); @@ -209,7 +211,6 @@ int bench_futex_requeue(int argc, const char **argv) init_stats(&requeued_stats); init_stats(&requeuetime_stats); - pthread_attr_init(&thread_attr); mutex_init(&thread_lock); cond_init(&thread_parent); cond_init(&thread_worker); @@ -219,7 +220,7 @@ int bench_futex_requeue(int argc, const char **argv) struct timeval start, end, runtime; /* create, launch & block all threads */ - block_threads(worker, thread_attr, cpu); + block_threads(worker, cpu); /* make sure all threads are already blocked */ mutex_lock(&thread_lock); @@ -301,7 +302,6 @@ int bench_futex_requeue(int argc, const char **argv) cond_destroy(&thread_parent); cond_destroy(&thread_worker); mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); print_summary(); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 6682e49d0ee0..5ab0234d74e6 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -95,10 +95,12 @@ static void *waking_workerfn(void *arg) return NULL; } -static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) +static void wakeup_threads(struct thread_data *td) { unsigned int i; + pthread_attr_t thread_attr; + pthread_attr_init(&thread_attr); pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); pthread_barrier_init(&barrier, NULL, params.nwakes + 1); @@ -122,6 +124,7 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) err(EXIT_FAILURE, "pthread_join"); pthread_barrier_destroy(&barrier); + pthread_attr_destroy(&thread_attr); } static void *blocked_workerfn(void *arg __maybe_unused) @@ -142,8 +145,7 @@ static void *blocked_workerfn(void *arg __maybe_unused) return NULL; } -static void block_threads(pthread_t *w, pthread_attr_t thread_attr, - struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { cpu_set_t *cpuset; unsigned int i; @@ -158,6 +160,9 @@ static void block_threads(pthread_t *w, pthread_attr_t thread_attr, /* create and block all threads */ for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); CPU_ZERO_S(size, cpuset); CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); @@ -170,6 +175,7 @@ static void block_threads(pthread_t *w, pthread_attr_t thread_attr, CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); } + pthread_attr_destroy(&thread_attr); } CPU_FREE(cpuset); } @@ -238,7 +244,6 @@ int bench_futex_wake_parallel(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct thread_data *waking_worker; struct perf_cpu_map *cpu; @@ -294,7 +299,6 @@ int bench_futex_wake_parallel(int argc, const char **argv) init_stats(&wakeup_stats); init_stats(&waketime_stats); - pthread_attr_init(&thread_attr); mutex_init(&thread_lock); cond_init(&thread_parent); cond_init(&thread_worker); @@ -305,7 +309,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) err(EXIT_FAILURE, "calloc"); /* create, launch & block all threads */ - block_threads(blocked_worker, thread_attr, cpu); + block_threads(blocked_worker, cpu); /* make sure all threads are already blocked */ mutex_lock(&thread_lock); @@ -317,7 +321,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) usleep(100000); /* Ok, all threads are patiently blocked, start waking folks up */ - wakeup_threads(waking_worker, thread_attr); + wakeup_threads(waking_worker); for (i = 0; i < params.nthreads; i++) { ret = pthread_join(blocked_worker[i], NULL); @@ -336,7 +340,6 @@ int bench_futex_wake_parallel(int argc, const char **argv) cond_destroy(&thread_parent); cond_destroy(&thread_worker); mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); print_summary(); diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 9ecab6620a87..18a5894af8bb 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -95,8 +95,7 @@ static void print_summary(void) rel_stddev_stats(waketime_stddev, waketime_avg)); } -static void block_threads(pthread_t *w, - pthread_attr_t thread_attr, struct perf_cpu_map *cpu) +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) { cpu_set_t *cpuset; unsigned int i; @@ -110,6 +109,9 @@ static void block_threads(pthread_t *w, /* create and block all threads */ for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); CPU_ZERO_S(size, cpuset); CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); @@ -122,6 +124,7 @@ static void block_threads(pthread_t *w, CPU_FREE(cpuset); err(EXIT_FAILURE, "pthread_create"); } + pthread_attr_destroy(&thread_attr); } CPU_FREE(cpuset); } @@ -138,7 +141,6 @@ int bench_futex_wake(int argc, const char **argv) int ret = 0; unsigned int i, j; struct sigaction act; - pthread_attr_t thread_attr; struct perf_cpu_map *cpu; argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0); @@ -178,7 +180,6 @@ int bench_futex_wake(int argc, const char **argv) init_stats(&wakeup_stats); init_stats(&waketime_stats); - pthread_attr_init(&thread_attr); mutex_init(&thread_lock); cond_init(&thread_parent); cond_init(&thread_worker); @@ -188,7 +189,7 @@ int bench_futex_wake(int argc, const char **argv) struct timeval start, end, runtime; /* create, launch & block all threads */ - block_threads(worker, thread_attr, cpu); + block_threads(worker, cpu); /* make sure all threads are already blocked */ mutex_lock(&thread_lock); @@ -228,7 +229,6 @@ int bench_futex_wake(int argc, const char **argv) cond_destroy(&thread_parent); cond_destroy(&thread_worker); mutex_destroy(&thread_lock); - pthread_attr_destroy(&thread_attr); print_summary(); -- cgit v1.2.3 From e57d739334d55688bfbf161b1501426467d02c86 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 11 Jun 2023 16:36:10 -0700 Subject: perf bench sched messaging: Free contexts on exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Place sender and receiver contexts onto lists so that they may be freed on exit. Add missing pthread_attr_destroy. Fixes memory leaks reported by leak sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20230611233610.953456-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/sched-messaging.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index 488f6e6ba1a5..fa1f8f998814 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #define DATASIZE 100 @@ -35,8 +36,11 @@ static bool use_pipes = false; static unsigned int nr_loops = 100; static bool thread_mode = false; static unsigned int num_groups = 10; +static struct list_head sender_contexts = LIST_HEAD_INIT(sender_contexts); +static struct list_head receiver_contexts = LIST_HEAD_INIT(receiver_contexts); struct sender_context { + struct list_head list; unsigned int num_fds; int ready_out; int wakefd; @@ -44,6 +48,7 @@ struct sender_context { }; struct receiver_context { + struct list_head list; unsigned int num_packets; int in_fds[2]; int ready_out; @@ -170,6 +175,7 @@ static pthread_t create_worker(void *ctx, void *(*func)(void *)) if (ret != 0) err(EXIT_FAILURE, "pthread_create failed"); + pthread_attr_destroy(&attr); return childid; } @@ -201,6 +207,7 @@ static unsigned int group(pthread_t *pth, if (!snd_ctx) err(EXIT_FAILURE, "malloc()"); + list_add(&snd_ctx->list, &sender_contexts); for (i = 0; i < num_fds; i++) { int fds[2]; struct receiver_context *ctx = malloc(sizeof(*ctx)); @@ -208,6 +215,7 @@ static unsigned int group(pthread_t *pth, if (!ctx) err(EXIT_FAILURE, "malloc()"); + list_add(&ctx->list, &receiver_contexts); /* Create the pipe between client and server */ fdpair(fds); @@ -266,6 +274,7 @@ int bench_sched_messaging(int argc, const char **argv) int readyfds[2], wakefds[2]; char dummy; pthread_t *pth_tab; + struct sender_context *pos, *n; argc = parse_options(argc, argv, options, bench_sched_message_usage, 0); @@ -324,6 +333,13 @@ int bench_sched_messaging(int argc, const char **argv) } free(pth_tab); - + list_for_each_entry_safe(pos, n, &sender_contexts, list) { + list_del_init(&pos->list); + free(pos); + } + list_for_each_entry_safe(pos, n, &receiver_contexts, list) { + list_del_init(&pos->list); + free(pos); + } return 0; } -- cgit v1.2.3 From 232418a0b2e8b8e72dac003b19352f1b647cdb31 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 20:43:19 -0700 Subject: perf sched: Avoid large stack allocations Commit 5ded57ac1bdb ("perf inject: Remove static variables") moved static variables to local, however, in this case 3 MAX_CPUS (4096) sized arrays were moved onto the stack making the stack frame quite large. Avoid the stack usage by dynamically allocating the arrays. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230527034324.2597593-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c9ddf73689cd..9ab300b6f131 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -193,8 +193,8 @@ struct perf_sched { * weird events, such as a task being switched away that is not current. */ struct perf_cpu max_cpu; - u32 curr_pid[MAX_CPUS]; - struct thread *curr_thread[MAX_CPUS]; + u32 *curr_pid; + struct thread **curr_thread; char next_shortname1; char next_shortname2; unsigned int replay_repeat; @@ -224,7 +224,7 @@ struct perf_sched { u64 run_avg; u64 all_runtime; u64 all_count; - u64 cpu_last_switched[MAX_CPUS]; + u64 *cpu_last_switched; struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root; struct list_head sort_list, cmp_pid; bool force; @@ -3595,7 +3595,22 @@ int cmd_sched(int argc, const char **argv) mutex_init(&sched.start_work_mutex); mutex_init(&sched.work_done_wait_mutex); - for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++) + sched.curr_thread = calloc(MAX_CPUS, sizeof(*sched.curr_thread)); + if (!sched.curr_thread) { + ret = -ENOMEM; + goto out; + } + sched.cpu_last_switched = calloc(MAX_CPUS, sizeof(*sched.cpu_last_switched)); + if (!sched.cpu_last_switched) { + ret = -ENOMEM; + goto out; + } + sched.curr_pid = malloc(MAX_CPUS * sizeof(*sched.curr_pid)); + if (!sched.curr_pid) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MAX_CPUS; i++) sched.curr_pid[i] = -1; argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, @@ -3664,6 +3679,9 @@ int cmd_sched(int argc, const char **argv) } out: + free(sched.curr_pid); + free(sched.cpu_last_switched); + free(sched.curr_thread); mutex_destroy(&sched.start_work_mutex); mutex_destroy(&sched.work_done_wait_mutex); -- cgit v1.2.3 From e590e46b548e0de3df52a8a093639ce67002fae0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 20:43:20 -0700 Subject: perf script: Remove some large stack allocations Some char buffers are stack allocated but in total they come to 24kb. Avoid Wstack-usage warnings by moving the arrays to being dynamically allocated. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230527034324.2597593-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e3f435e6a7d0..200b3e7ea8da 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3318,14 +3318,21 @@ static int list_available_scripts(const struct option *opt __maybe_unused, int unset __maybe_unused) { struct dirent *script_dirent, *lang_dirent; - char scripts_path[MAXPATHLEN]; + char *buf, *scripts_path, *script_path, *lang_path, *first_half; DIR *scripts_dir, *lang_dir; - char script_path[MAXPATHLEN]; - char lang_path[MAXPATHLEN]; struct script_desc *desc; - char first_half[BUFSIZ]; char *script_root; + buf = malloc(3 * MAXPATHLEN + BUFSIZ); + if (!buf) { + pr_err("malloc failed\n"); + exit(-1); + } + scripts_path = buf; + script_path = buf + MAXPATHLEN; + lang_path = buf + 2 * MAXPATHLEN; + first_half = buf + 3 * MAXPATHLEN; + snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path()); scripts_dir = opendir(scripts_path); @@ -3334,6 +3341,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, "open(%s) failed.\n" "Check \"PERF_EXEC_PATH\" env to set scripts dir.\n", scripts_path); + free(buf); exit(-1); } @@ -3364,6 +3372,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, desc->half_liner ? desc->half_liner : ""); } + free(buf); exit(0); } -- cgit v1.2.3 From d3944f0ed4e4039201b160fc11004abaa2ca5385 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 20:43:21 -0700 Subject: perf inject: Lazily allocate event_copy The event_copy is 64kb (PERF_SAMPLE_SIZE_MAX) and stack allocated in struct perf_inject. It is used for aux events that may not exist in a file. Make the array allocation lazy to cut down on the stack usage. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230527034324.2597593-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d19a1b862306..2023b7a0daa6 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -122,7 +122,7 @@ struct perf_inject { u64 aux_id; struct list_head samples; struct itrace_synth_opts itrace_synth_opts; - char event_copy[PERF_SAMPLE_MAX_SIZE]; + char *event_copy; struct perf_file_section secs[HEADER_FEAT_BITS]; struct guest_session guest_session; struct strlist *known_build_ids; @@ -320,8 +320,14 @@ perf_inject__cut_auxtrace_sample(struct perf_inject *inject, { size_t sz1 = sample->aux_sample.data - (void *)event; size_t sz2 = event->header.size - sample->aux_sample.size - sz1; - union perf_event *ev = (union perf_event *)inject->event_copy; + union perf_event *ev; + if (inject->event_copy == NULL) { + inject->event_copy = malloc(PERF_SAMPLE_MAX_SIZE); + if (!inject->event_copy) + return ERR_PTR(-ENOMEM); + } + ev = (union perf_event *)inject->event_copy; if (sz1 > event->header.size || sz2 > event->header.size || sz1 + sz2 > event->header.size || sz1 < sizeof(struct perf_event_header) + sizeof(u64)) @@ -357,8 +363,11 @@ static int perf_event__repipe_sample(struct perf_tool *tool, build_id__mark_dso_hit(tool, event, sample, evsel, machine); - if (inject->itrace_synth_opts.set && sample->aux_sample.size) + if (inject->itrace_synth_opts.set && sample->aux_sample.size) { event = perf_inject__cut_auxtrace_sample(inject, event, sample); + if (IS_ERR(event)) + return PTR_ERR(event); + } return perf_event__repipe_synth(tool, event); } @@ -2391,5 +2400,6 @@ out_close_output: if (!inject.in_place_update) perf_data__close(&inject.output); free(inject.itrace_synth_opts.vm_tm_corr_args); + free(inject.event_copy); return ret; } -- cgit v1.2.3 From 892d00fba18a6dec2620165ce05e1697496f8381 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 26 May 2023 20:43:22 -0700 Subject: perf inject: Lazily allocate guest_event event_buf The event_buf is 64kb (PERF_SAMPLE_SIZE_MAX) and stack allocated in struct perf_inject. It is used for guest events that may not exist in a file. Make the array allocation lazy to cut down on the stack usage. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230527034324.2597593-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 2023b7a0daa6..c8cf2fdd9cff 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -47,7 +47,7 @@ struct guest_event { struct perf_sample sample; union perf_event *event; - char event_buf[PERF_SAMPLE_MAX_SIZE]; + char *event_buf; }; struct guest_id { @@ -1374,11 +1374,19 @@ static void guest_session__convert_time(struct guest_session *gs, u64 guest_time static int guest_session__fetch(struct guest_session *gs) { - void *buf = gs->ev.event_buf; - struct perf_event_header *hdr = buf; + void *buf; + struct perf_event_header *hdr; size_t hdr_sz = sizeof(*hdr); ssize_t ret; + buf = gs->ev.event_buf; + if (!buf) { + buf = malloc(PERF_SAMPLE_MAX_SIZE); + if (!buf) + return -ENOMEM; + gs->ev.event_buf = buf; + } + hdr = buf; ret = readn(gs->tmp_fd, buf, hdr_sz); if (ret < 0) return ret; @@ -2401,5 +2409,6 @@ out_close_output: perf_data__close(&inject.output); free(inject.itrace_synth_opts.vm_tm_corr_args); free(inject.event_copy); + free(inject.guest_session.ev.event_buf); return ret; } -- cgit v1.2.3 From 103b3d2f94732fb1bc796e68e4cdfbcd731bbeaa Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Jun 2023 16:00:26 -0700 Subject: perf annotate: Allow whitespace between insn operands The llvm-objdump adds a space between the operands while GNU objdump does not. Allow a space to handle the both. In GNU objdump: Disassembly of section .text: here | ffffffff81000000 <_stext>: v ffffffff81000000: 48 8d 25 51 1f 40 01 lea 0x1401f51(%rip),%rsp ffffffff81000007: e8 d4 00 00 00 call ffffffff810000e0 ffffffff8100000c: 48 8d 3d ed ff ff ff lea -0x13(%rip),%rdi In llvm-objdump: Disassembly of section .text: here | ffffffff81000000 : v ffffffff81000000: 48 8d 25 51 1f 40 01 leaq 20979537(%rip), %rsp ffffffff81000007: e8 d4 00 00 00 callq 0xffffffff810000e0 ffffffff8100000c: 48 8d 3d ed ff ff ff leaq -19(%rip), %rdi Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230612230026.3887586-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index bde890cfa620..cdd1924a4418 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -579,7 +579,7 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_sy if (ops->source.raw == NULL) return -1; - target = ++s; + target = skip_spaces(++s); comment = strchr(s, arch->objdump.comment_char); if (comment != NULL) -- cgit v1.2.3 From d15b8c76c964e882593365a5d1b4b924c945b90e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Jun 2023 21:56:09 -0300 Subject: perf pfm: Remove duplicate util/cpumap.h include Fixes: d1f1cecc92ae0dba ("perf list: Check if libpfm4 event is supported") Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202306110636.2sTsiAcl-lkp@intel.com/ Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pfm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/pfm.c b/tools/perf/util/pfm.c index 4c1024c343dd..862e4a689868 100644 --- a/tools/perf/util/pfm.c +++ b/tools/perf/util/pfm.c @@ -13,7 +13,6 @@ #include "util/pmus.h" #include "util/pfm.h" #include "util/strbuf.h" -#include "util/cpumap.h" #include "util/thread_map.h" #include -- cgit v1.2.3 From 3abfcfd847717d232e36963f31a361747c388fe7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Jun 2023 16:41:01 -0700 Subject: perf dwarf-aux: Fix off-by-one in die_get_varname() The die_get_varname() returns "(unknown_type)" string if it failed to find a type for the variable. But it had a space before the opening parenthesis and it made the closing parenthesis cut off due to the off-by-one in the string length (14). Signed-off-by: Namhyung Kim Fixes: 88fd633cdfa19060 ("perf probe: No need to use formatting strbuf method") Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230612234102.3909116-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index b07414409771..3bff67874563 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1103,7 +1103,7 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf) ret = die_get_typename(vr_die, buf); if (ret < 0) { pr_debug("Failed to get type, make it unknown.\n"); - ret = strbuf_add(buf, " (unknown_type)", 14); + ret = strbuf_add(buf, "(unknown_type)", 14); } return ret < 0 ? ret : strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); -- cgit v1.2.3 From 7f911905ffe62e4fb7274f1f09f4148a449b2f83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Jun 2023 16:41:02 -0700 Subject: perf dwarf-aux: Allow unnamed struct/union/enum It's possible some struct/union/enum type don't have type name. Allow the empty name after "struct"/"union"/"enum" string rather than fail. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230612234102.3909116-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 3bff67874563..45e018c0ebf5 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1074,16 +1074,18 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) /* Function pointer */ return strbuf_add(buf, "(function_type)", 15); } else { - if (!dwarf_diename(&type)) - return -ENOENT; + const char *name = dwarf_diename(&type); + if (tag == DW_TAG_union_type) tmp = "union "; else if (tag == DW_TAG_structure_type) tmp = "struct "; else if (tag == DW_TAG_enumeration_type) tmp = "enum "; + else if (name == NULL) + return -ENOENT; /* Write a base name */ - return strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type)); + return strbuf_addf(buf, "%s%s", tmp, name ?: ""); } ret = die_get_typename(&type, buf); return ret ? ret : strbuf_addstr(buf, tmp); -- cgit v1.2.3 From d436373a75f53cafa37df0ace3b329b119739699 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 13 Jun 2023 16:22:26 +0300 Subject: perf tests: Make x86 new instructions test optional at build time The "x86 instruction decoder - new instructions" test takes up space but is only really useful to developers. Make it optional at build time. Add variable EXTRA_TESTS which must be defined in order to build perf with the test. Example: Before: $ make -C tools/perf clean >/dev/null $ make -C tools/perf >/dev/null Makefile.config:650: No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR Makefile.config:1149: libpfm4 not found, disables libpfm4 support. Please install libpfm4-dev PERF_VERSION = 6.4.rc3.gd15b8c76c964 $ readelf -SW tools/perf/perf | grep '\.rela.dyn\|.rodata\|\.data.rel.ro' [10] .rela.dyn RELA 000000000002fcb0 02fcb0 0748b0 18 A 6 0 8 [18] .rodata PROGBITS 00000000002eb000 2eb000 6bac00 00 A 0 0 32 [25] .data.rel.ro PROGBITS 00000000009ea180 9e9180 04b540 00 WA 0 0 32 After: $ make -C tools/perf clean >/dev/null $ make -C tools/perf >/dev/null Makefile.config:650: No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR Makefile.config:1154: libpfm4 not found, disables libpfm4 support. Please install libpfm4-dev PERF_VERSION = 6.4.rc3.g4ea9c1569ea4 $ readelf -SW tools/perf/perf | grep '\.rela.dyn\|.rodata\|\.data.rel.ro' [10] .rela.dyn RELA 000000000002f3c8 02f3c8 036d68 18 A 6 0 8 [18] .rodata PROGBITS 00000000002ac000 2ac000 68da80 00 A 0 0 32 [25] .data.rel.ro PROGBITS 000000000097d440 97c440 022280 00 WA 0 0 32 Committer notes: Build with 'make EXTRA_TESTS=1 -C tools/perf O=/tmp/build/perf" and reproduced the ELF section size differences. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/683fea7c-f5e9-fa20-f96b-f6233ed5d2a7@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 5 +++++ tools/perf/Makefile.perf | 4 ++++ tools/perf/arch/x86/include/arch-tests.h | 2 ++ tools/perf/arch/x86/tests/Build | 5 ++++- tools/perf/arch/x86/tests/arch-tests.c | 4 ++++ tools/perf/tests/make | 1 + 6 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index a794d9eca93d..9c5aa14a44cf 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1075,6 +1075,11 @@ ifndef NO_AUXTRACE endif endif +ifdef EXTRA_TESTS + $(call detected,CONFIG_EXTRA_TESTS) + CFLAGS += -DHAVE_EXTRA_TESTS +endif + ifndef NO_JVMTI ifneq (,$(wildcard /usr/sbin/update-java-alternatives)) JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | awk '{print $$3}') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f48794816d82..b1e62a621f92 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -128,6 +128,10 @@ include ../scripts/utilities.mak # # Define BUILD_NONDISTRO to enable building an linking against libbfd and # libiberty distribution license incompatible libraries. +# +# Define EXTRA_TESTS to enable building extra tests useful mainly to perf +# developers, such as: +# x86 instruction decoder - new instructions test # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 33d39c1d3e64..df133020d582 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -6,7 +6,9 @@ struct test_suite; /* Tests */ int test__rdpmc(struct test_suite *test, int subtest); +#ifdef HAVE_EXTRA_TESTS int test__insn_x86(struct test_suite *test, int subtest); +#endif int test__intel_pt_pkt_decoder(struct test_suite *test, int subtest); int test__intel_pt_hybrid_compat(struct test_suite *test, int subtest); int test__bp_modify(struct test_suite *test, int subtest); diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index 08cc8b9c931e..394771c00dca 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -4,5 +4,8 @@ perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o perf-y += arch-tests.o perf-y += sample-parsing.o perf-y += hybrid.o -perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-test.o +perf-$(CONFIG_AUXTRACE) += intel-pt-test.o +ifeq ($(CONFIG_EXTRA_TESTS),y) +perf-$(CONFIG_AUXTRACE) += insn-x86.o +endif perf-$(CONFIG_X86_64) += bp-modify.o diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c index 147ad0638bbb..3f2b90c59f92 100644 --- a/tools/perf/arch/x86/tests/arch-tests.c +++ b/tools/perf/arch/x86/tests/arch-tests.c @@ -4,7 +4,9 @@ #include "arch-tests.h" #ifdef HAVE_AUXTRACE_SUPPORT +#ifdef HAVE_EXTRA_TESTS DEFINE_SUITE("x86 instruction decoder - new instructions", insn_x86); +#endif static struct test_case intel_pt_tests[] = { TEST_CASE("Intel PT packet decoder", intel_pt_pkt_decoder), @@ -37,7 +39,9 @@ struct test_suite *arch_tests[] = { &suite__dwarf_unwind, #endif #ifdef HAVE_AUXTRACE_SUPPORT +#ifdef HAVE_EXTRA_TESTS &suite__insn_x86, +#endif &suite__intel_pt, #endif #if defined(__x86_64__) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 8dd3f8090352..885cd321d67b 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -69,6 +69,7 @@ make_clean_all := clean all make_python_perf_so := $(python_perf_so) make_debug := DEBUG=1 make_nondistro := BUILD_NONDISTRO=1 +make_extra_tests := EXTRA_TESTS=1 make_no_libperl := NO_LIBPERL=1 make_no_libpython := NO_LIBPYTHON=1 make_no_scripts := NO_LIBPYTHON=1 NO_LIBPERL=1 -- cgit v1.2.3 From c7a0023a1495355e71177ebfae33d27ad97577c3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Jun 2023 20:48:16 -0700 Subject: perf srcline: Make addr2line configuration failure more verbose To aid debugging why it fails. Also, combine the loops for reading a line for the llvm/binutils cases. Signed-off-by: Ian Rogers Tested-by: Changbin Du Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Tom Rix Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/20230613034817.1356114-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index aec596a0b0bb..d477332586b2 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -443,7 +443,7 @@ enum a2l_style { LLVM, }; -static enum a2l_style addr2line_configure(struct child_process *a2l) +static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name) { static bool cached; static enum a2l_style style; @@ -452,6 +452,7 @@ static enum a2l_style addr2line_configure(struct child_process *a2l) char buf[128]; struct io io; int ch; + int lines; if (write(a2l->in, ",\n", 2) != 2) return BROKEN; @@ -461,19 +462,29 @@ static enum a2l_style addr2line_configure(struct child_process *a2l) if (ch == ',') { style = LLVM; cached = true; + lines = 1; } else if (ch == '?') { style = GNU_BINUTILS; cached = true; + lines = 2; } else { - style = BROKEN; + if (!symbol_conf.disable_add2line_warn) { + char *output = NULL; + size_t output_len; + + io__getline(&io, &output, &output_len); + pr_warning("%s %s: addr2line configuration failed\n", + __func__, dso_name); + pr_warning("\t%c%s", ch, output); + } + return BROKEN; } - do { + while (lines) { ch = io__get_char(&io); - } while (ch > 0 && ch != '\n'); - if (style == GNU_BINUTILS) { - do { - ch = io__get_char(&io); - } while (ch > 0 && ch != '\n'); + if (ch <= 0) + break; + if (ch == '\n') + lines--; } /* Ignore SIGPIPE in the event addr2line exits. */ signal(SIGPIPE, SIG_IGN); @@ -593,12 +604,9 @@ static int addr2line(const char *dso_name, u64 addr, pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name); goto out; } - a2l_style = addr2line_configure(a2l); - if (a2l_style == BROKEN) { - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s: addr2line configuration failed\n", __func__); + a2l_style = addr2line_configure(a2l, dso_name); + if (a2l_style == BROKEN) goto out; - } /* * Send our request and then *deliberately* send something that can't be interpreted as -- cgit v1.2.3 From 8dc26b6f718a81188519b77033eea764c9b6f732 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Jun 2023 20:48:17 -0700 Subject: perf srcline: Make sentinel reading for binutils addr2line more robust The addr2line process is sent an address then multiple function, filename:line "records" are read. To detect the end of output a ',' is sent and for llvm-addr2line a ',' is then read back showing the end of addrline's output. For binutils addr2line the ',' translates to address 0 and we expect the bogus filename marker "??:0" (see filename_split) to be sent from addr2line. For some kernels address 0 may have a mapping and so a seemingly valid inline output is given and breaking the sentinel discovery: ``` $ addr2line -e vmlinux -f -i , __per_cpu_start ./arch/x86/kernel/cpu/common.c:1850 ``` To avoid this problem enable the address dumping for addr2line (the -a option). If an address of 0x0000000000000000 is read then this is the sentinel value working around the problem above. The filename_split still needs to check for "??:0" as bogus non-zero addresses also need handling. Reported-by: Changbin Du Signed-off-by: Ian Rogers Tested-by: Changbin Du Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Tom Rix Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/20230613034817.1356114-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 61 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index d477332586b2..b27b4b3c391b 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -408,7 +408,7 @@ static struct child_process *addr2line_subprocess_init(const char *addr2line_pat const char *argv[] = { addr2line_path ?: "addr2line", "-e", binary_path, - "-i", "-f", NULL + "-a", "-i", "-f", NULL }; struct child_process *a2l = zalloc(sizeof(*a2l)); int start_command_status = 0; @@ -463,10 +463,10 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char style = LLVM; cached = true; lines = 1; - } else if (ch == '?') { + } else if (ch == '0') { style = GNU_BINUTILS; cached = true; - lines = 2; + lines = 3; } else { if (!symbol_conf.disable_add2line_warn) { char *output = NULL; @@ -518,20 +518,64 @@ static int read_addr2line_record(struct io *io, if (line_nr != NULL) *line_nr = 0; + /* + * Read the first line. Without an error this will be either an address + * like 0x1234 or for llvm-addr2line the sentinal ',' character. + */ if (io__getline(io, &line, &line_len) < 0 || !line_len) goto error; - if (style == LLVM && line_len == 2 && line[0] == ',') { - zfree(&line); - return 0; + if (style == LLVM) { + if (line_len == 2 && line[0] == ',') { + zfree(&line); + return 0; + } + } else { + int zero_count = 0, non_zero_count = 0; + + /* The address should always start 0x. */ + if (line_len < 2 || line[0] != '0' || line[1] != 'x') + goto error; + + for (size_t i = 2; i < line_len; i++) { + if (line[i] == '0') + zero_count++; + else if (line[i] != '\n') + non_zero_count++; + } + if (!non_zero_count) { + int ch; + + if (!zero_count) { + /* Line was erroneous just '0x'. */ + goto error; + } + /* + * Line was 0x0..0, the sentinel for binutils. Remove + * the function and filename lines. + */ + zfree(&line); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + return 0; + } } + /* Read the second function name line. */ + if (io__getline(io, &line, &line_len) < 0 || !line_len) + goto error; + if (function != NULL) *function = strdup(strim(line)); zfree(&line); line_len = 0; + /* Read the third filename and line number line. */ if (io__getline(io, &line, &line_len) < 0 || !line_len) goto error; @@ -635,8 +679,9 @@ static int addr2line(const char *dso_name, u64 addr, goto out; case 0: /* - * The first record was invalid, so return failure, but first read another - * record, since we asked a junk question and have to clear the answer out. + * The first record was invalid, so return failure, but first + * read another record, since we sent a sentinel ',' for the + * sake of detected the last inlined function. */ switch (read_addr2line_record(&io, a2l_style, NULL, NULL, NULL)) { case -1: -- cgit v1.2.3 From 97d5f2e9ee12cdc7214d5835d35c59404cfafee6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 9 Jun 2023 15:40:04 -0700 Subject: tools api fs: More thread safety for global filesystem variables Multiple threads, such as with "perf top", may race to initialize a file system path like hugetlbfs. The racy initialization of the path leads to at least memory leaks. To avoid this initialize each fs for reading the mount point path with pthread_once. Mounting the file system may also be racy, so introduce a mutex over the function. This does mean that the path is being accessed with and without a mutex, which is inherently racy but hopefully benign, especially as there are fewer callers to fs__mount. Remove the fs__entries by directly using global variables, this was done as no argument like the index can be passed to the init once routine. Issue found and tested with "perf top" and address sanitizer. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20230609224004.180988-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 211 ++++++++++++++++++++------------------------------ 1 file changed, 86 insertions(+), 125 deletions(-) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 22d34a0be8b4..5cb0eeec2c8a 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include @@ -43,7 +45,7 @@ #define BPF_FS_MAGIC 0xcafe4a11 #endif -static const char * const sysfs__fs_known_mountpoints[] = { +static const char * const sysfs__known_mountpoints[] = { "/sys", 0, }; @@ -86,69 +88,70 @@ static const char * const bpf_fs__known_mountpoints[] = { }; struct fs { - const char *name; - const char * const *mounts; + const char * const name; + const char * const * const mounts; char *path; - bool found; - bool checked; - long magic; -}; - -enum { - FS__SYSFS = 0, - FS__PROCFS = 1, - FS__DEBUGFS = 2, - FS__TRACEFS = 3, - FS__HUGETLBFS = 4, - FS__BPF_FS = 5, + pthread_mutex_t mount_mutex; + const long magic; }; #ifndef TRACEFS_MAGIC #define TRACEFS_MAGIC 0x74726163 #endif -static struct fs fs__entries[] = { - [FS__SYSFS] = { - .name = "sysfs", - .mounts = sysfs__fs_known_mountpoints, - .magic = SYSFS_MAGIC, - .checked = false, - }, - [FS__PROCFS] = { - .name = "proc", - .mounts = procfs__known_mountpoints, - .magic = PROC_SUPER_MAGIC, - .checked = false, - }, - [FS__DEBUGFS] = { - .name = "debugfs", - .mounts = debugfs__known_mountpoints, - .magic = DEBUGFS_MAGIC, - .checked = false, - }, - [FS__TRACEFS] = { - .name = "tracefs", - .mounts = tracefs__known_mountpoints, - .magic = TRACEFS_MAGIC, - .checked = false, - }, - [FS__HUGETLBFS] = { - .name = "hugetlbfs", - .mounts = hugetlbfs__known_mountpoints, - .magic = HUGETLBFS_MAGIC, - .checked = false, - }, - [FS__BPF_FS] = { - .name = "bpf", - .mounts = bpf_fs__known_mountpoints, - .magic = BPF_FS_MAGIC, - .checked = false, - }, -}; +static void fs__init_once(struct fs *fs); +static const char *fs__mountpoint(const struct fs *fs); +static const char *fs__mount(struct fs *fs); + +#define FS(lower_name, fs_name, upper_name) \ +static struct fs fs__##lower_name = { \ + .name = #fs_name, \ + .mounts = lower_name##__known_mountpoints, \ + .magic = upper_name##_MAGIC, \ + .mount_mutex = PTHREAD_MUTEX_INITIALIZER, \ +}; \ + \ +static void lower_name##_init_once(void) \ +{ \ + struct fs *fs = &fs__##lower_name; \ + \ + fs__init_once(fs); \ +} \ + \ +const char *lower_name##__mountpoint(void) \ +{ \ + static pthread_once_t init_once = PTHREAD_ONCE_INIT; \ + struct fs *fs = &fs__##lower_name; \ + \ + pthread_once(&init_once, lower_name##_init_once); \ + return fs__mountpoint(fs); \ +} \ + \ +const char *lower_name##__mount(void) \ +{ \ + const char *mountpoint = lower_name##__mountpoint(); \ + struct fs *fs = &fs__##lower_name; \ + \ + if (mountpoint) \ + return mountpoint; \ + \ + return fs__mount(fs); \ +} \ + \ +bool lower_name##__configured(void) \ +{ \ + return lower_name##__mountpoint() != NULL; \ +} + +FS(sysfs, sysfs, SYSFS); +FS(procfs, procfs, PROC_SUPER); +FS(debugfs, debugfs, DEBUGFS); +FS(tracefs, tracefs, TRACEFS); +FS(hugetlbfs, hugetlbfs, HUGETLBFS); +FS(bpf_fs, bpf, BPF_FS); static bool fs__read_mounts(struct fs *fs) { - bool found = false; char type[100]; FILE *fp; char path[PATH_MAX + 1]; @@ -157,22 +160,17 @@ static bool fs__read_mounts(struct fs *fs) if (fp == NULL) return false; - while (!found && - fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n", + while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n", path, type) == 2) { if (strcmp(type, fs->name) == 0) { - free(fs->path); fs->path = strdup(path); - if (!fs->path) - return false; - found = true; + fclose(fp); + return fs->path != NULL; } } - fclose(fp); - fs->checked = true; - return fs->found = found; + return false; } static int fs__valid_mount(const char *fs, long magic) @@ -194,11 +192,9 @@ static bool fs__check_mounts(struct fs *fs) ptr = fs->mounts; while (*ptr) { if (fs__valid_mount(*ptr, fs->magic) == 0) { - free(fs->path); fs->path = strdup(*ptr); if (!fs->path) return false; - fs->found = true; return true; } ptr++; @@ -236,45 +232,26 @@ static bool fs__env_override(struct fs *fs) if (!override_path) return false; - free(fs->path); fs->path = strdup(override_path); if (!fs->path) return false; - fs->found = true; - fs->checked = true; return true; } -static const char *fs__get_mountpoint(struct fs *fs) +static void fs__init_once(struct fs *fs) { - if (fs__env_override(fs)) - return fs->path; - - if (fs__check_mounts(fs)) - return fs->path; - - if (fs__read_mounts(fs)) - return fs->path; - - return NULL; + if (!fs__env_override(fs) && + !fs__check_mounts(fs) && + !fs__read_mounts(fs)) { + assert(!fs->path); + } else { + assert(fs->path); + } } -static const char *fs__mountpoint(int idx) +static const char *fs__mountpoint(const struct fs *fs) { - struct fs *fs = &fs__entries[idx]; - - if (fs->found) - return (const char *)fs->path; - - /* the mount point was already checked for the mount point - * but and did not exist, so return NULL to avoid scanning again. - * This makes the found and not found paths cost equivalent - * in case of multiple calls. - */ - if (fs->checked) - return NULL; - - return fs__get_mountpoint(fs); + return fs->path; } static const char *mount_overload(struct fs *fs) @@ -289,45 +266,29 @@ static const char *mount_overload(struct fs *fs) return getenv(upper_name) ?: *fs->mounts; } -static const char *fs__mount(int idx) +static const char *fs__mount(struct fs *fs) { - struct fs *fs = &fs__entries[idx]; const char *mountpoint; - if (fs__mountpoint(idx)) - return (const char *)fs->path; + pthread_mutex_lock(&fs->mount_mutex); - mountpoint = mount_overload(fs); + /* Check if path found inside the mutex to avoid races with other callers of mount. */ + mountpoint = fs__mountpoint(fs); + if (mountpoint) + goto out; - if (mount(NULL, mountpoint, fs->name, 0, NULL) < 0) - return NULL; - - return fs__check_mounts(fs) ? fs->path : NULL; -} + mountpoint = mount_overload(fs); -#define FS(name, idx) \ -const char *name##__mountpoint(void) \ -{ \ - return fs__mountpoint(idx); \ -} \ - \ -const char *name##__mount(void) \ -{ \ - return fs__mount(idx); \ -} \ - \ -bool name##__configured(void) \ -{ \ - return name##__mountpoint() != NULL; \ + if (mount(NULL, mountpoint, fs->name, 0, NULL) == 0 && + fs__valid_mount(mountpoint, fs->magic) == 0) { + fs->path = strdup(mountpoint); + mountpoint = fs->path; + } +out: + pthread_mutex_unlock(&fs->mount_mutex); + return mountpoint; } -FS(sysfs, FS__SYSFS); -FS(procfs, FS__PROCFS); -FS(debugfs, FS__DEBUGFS); -FS(tracefs, FS__TRACEFS); -FS(hugetlbfs, FS__HUGETLBFS); -FS(bpf_fs, FS__BPF_FS); - int filename__read_int(const char *filename, int *value) { char line[64]; -- cgit v1.2.3 From e4c4e8a538a0db071d291bc2dca487e1882a7d4f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 7 Jun 2023 09:26:53 -0700 Subject: perf metric: Fix no group check The no group check fails if there is more than one meticgroup in the metricgroup_no_group. The first parameter of the match_metric() should be the string, while the substring should be the second parameter. Fixes: ccc66c6092802d68 ("perf metric: JSON flag to not group events if gathering a metric group") Signed-off-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230607162700.3234712-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 70ef2e23a710..74f2d8efc02d 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1175,7 +1175,7 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm, if (pm->metric_expr && match_pm_metric(pm, data->pmu, data->metric_name)) { bool metric_no_group = data->metric_no_group || - match_metric(data->metric_name, pm->metricgroup_no_group); + match_metric(pm->metricgroup_no_group, data->metric_name); data->has_match = true; ret = add_metric(data->list, pm, data->modifier, metric_no_group, -- cgit v1.2.3 From 6ec9503f45740b6ae4cb7f3e7441b1539c6d51d6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 13 Jun 2023 11:26:29 -0700 Subject: perf parse-events: Avoid string for PE_BP_COLON, PE_BP_SLASH There's no need to read the string ':' or '/' for PE_BP_COLON or PE_BP_SLASH and doing so causes parse-events.y to leak memory. The original patch has a committer note about not using these tokens presumably as yacc spotted they were a memory leak because no %destructor could be run. Remove the unused token workaround as there is now no value associated with these tokens. Fixes: f0617f526cb0c482 ("perf parse: Allow config terms with breakpoints") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rob Herring Link: https://lore.kernel.org/r/20230613182629.1500317-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.h | 4 ---- tools/perf/util/parse-events.l | 4 ++-- tools/perf/util/parse-events.y | 9 --------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 5fdc1f33f57e..b0eb95f93e9c 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -228,10 +228,6 @@ void parse_events_error__handle(struct parse_events_error *err, int idx, void parse_events_error__print(struct parse_events_error *err, const char *event); -static inline void parse_events_unused_value(const void *x __maybe_unused) -{ -} - #ifdef HAVE_LIBELF_SUPPORT /* * If the probe point starts with '%', diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 7629af3d5c7c..99335ec586ae 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -315,13 +315,13 @@ r0x{num_raw_hex} { return str(yyscanner, PE_RAW); } * are the same, so trailing context can be used disambiguate the two * cases. */ -":"/{modifier_bp} { return str(yyscanner, PE_BP_COLON); } +":"/{modifier_bp} { return PE_BP_COLON; } /* * The slash before memory length can get mixed up with the slash before * config terms. Fortunately config terms do not start with a numeric * digit, so trailing context can be used disambiguate the two cases. */ -"/"/{digit} { return str(yyscanner, PE_BP_SLASH); } +"/"/{digit} { return PE_BP_SLASH; } "/"/{non_digit} { BEGIN(config); return '/'; } {num_dec} { return value(yyscanner, 10); } {num_hex} { return value(yyscanner, 16); } diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 0c3d086cc22a..9f28d4b5502f 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -80,8 +80,6 @@ static void free_list_evsel(struct list_head* list_evsel) %type PE_LEGACY_CACHE %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP -%type PE_BP_COLON -%type PE_BP_SLASH %type PE_EVENT_NAME %type PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM @@ -510,9 +508,6 @@ PE_PREFIX_MEM PE_VALUE PE_BP_SLASH PE_VALUE PE_BP_COLON PE_MODIFIER_BP opt_event struct list_head *list; int err; - parse_events_unused_value(&$3); - parse_events_unused_value(&$5); - list = alloc_list(); ABORT_ON(!list); err = parse_events_add_breakpoint(_parse_state, list, @@ -531,8 +526,6 @@ PE_PREFIX_MEM PE_VALUE PE_BP_SLASH PE_VALUE opt_event_config struct list_head *list; int err; - parse_events_unused_value(&$3); - list = alloc_list(); ABORT_ON(!list); err = parse_events_add_breakpoint(_parse_state, list, @@ -550,8 +543,6 @@ PE_PREFIX_MEM PE_VALUE PE_BP_COLON PE_MODIFIER_BP opt_event_config struct list_head *list; int err; - parse_events_unused_value(&$3); - list = alloc_list(); ABORT_ON(!list); err = parse_events_add_breakpoint(_parse_state, list, -- cgit v1.2.3 From 0cd1ca4650c9cf5f318110f67d39cbebae3693b3 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 13 Jun 2023 15:25:04 +0530 Subject: perf tool x86: Consolidate is_amd check into single function There are multiple places where x86 specific code determines AMD vs Intel arch and acts based on that. Consolidate those checks into a single function. Signed-off-by: Ravi Bangoria Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ali Saidi Cc: Ananth Narayan Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Link: https://lore.kernel.org/r/20230613095506.547-3-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/env.c | 19 +++++++++++++++++++ tools/perf/arch/x86/util/env.h | 7 +++++++ tools/perf/arch/x86/util/evsel.c | 16 ++-------------- tools/perf/arch/x86/util/mem-events.c | 19 ++----------------- 5 files changed, 31 insertions(+), 31 deletions(-) create mode 100644 tools/perf/arch/x86/util/env.c create mode 100644 tools/perf/arch/x86/util/env.h diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 195ccfdef7aa..005907cb97d8 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -10,6 +10,7 @@ perf-y += evlist.o perf-y += mem-events.o perf-y += evsel.o perf-y += iostat.o +perf-y += env.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/env.c b/tools/perf/arch/x86/util/env.c new file mode 100644 index 000000000000..33b87f8ac1cc --- /dev/null +++ b/tools/perf/arch/x86/util/env.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "linux/string.h" +#include "util/env.h" +#include "env.h" + +bool x86__is_amd_cpu(void) +{ + struct perf_env env = { .total_mem = 0, }; + static int is_amd; /* 0: Uninitialized, 1: Yes, -1: No */ + + if (is_amd) + goto ret; + + perf_env__cpuid(&env); + is_amd = env.cpuid && strstarts(env.cpuid, "AuthenticAMD") ? 1 : -1; + +ret: + return is_amd >= 1 ? true : false; +} diff --git a/tools/perf/arch/x86/util/env.h b/tools/perf/arch/x86/util/env.h new file mode 100644 index 000000000000..d78f080b6b3f --- /dev/null +++ b/tools/perf/arch/x86/util/env.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENV_H +#define _X86_ENV_H + +bool x86__is_amd_cpu(void); + +#endif /* _X86_ENV_H */ diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index 25da46c8cca9..512c2d885d24 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -8,6 +8,7 @@ #include "linux/string.h" #include "evsel.h" #include "util/debug.h" +#include "env.h" #define IBS_FETCH_L3MISSONLY (1ULL << 59) #define IBS_OP_L3MISSONLY (1ULL << 16) @@ -78,23 +79,10 @@ void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr) { struct perf_pmu *evsel_pmu, *ibs_fetch_pmu, *ibs_op_pmu; static int warned_once; - /* 0: Uninitialized, 1: Yes, -1: No */ - static int is_amd; - if (warned_once || is_amd == -1) + if (warned_once || !x86__is_amd_cpu()) return; - if (!is_amd) { - struct perf_env *env = evsel__env(evsel); - - if (!perf_env__cpuid(env) || !env->cpuid || - !strstarts(env->cpuid, "AuthenticAMD")) { - is_amd = -1; - return; - } - is_amd = 1; - } - evsel_pmu = evsel__find_pmu(evsel); if (!evsel_pmu) return; diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c index 32879d12a8d5..a8a782bcb121 100644 --- a/tools/perf/arch/x86/util/mem-events.c +++ b/tools/perf/arch/x86/util/mem-events.c @@ -5,6 +5,7 @@ #include "map_symbol.h" #include "mem-events.h" #include "linux/string.h" +#include "env.h" static char mem_loads_name[100]; static bool mem_loads_name__init; @@ -27,28 +28,12 @@ static struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = { E("mem-ldst", "ibs_op//", "ibs_op"), }; -static int perf_mem_is_amd_cpu(void) -{ - struct perf_env env = { .total_mem = 0, }; - - perf_env__cpuid(&env); - if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD")) - return 1; - return -1; -} - struct perf_mem_event *perf_mem_events__ptr(int i) { - /* 0: Uninitialized, 1: Yes, -1: No */ - static int is_amd; - if (i >= PERF_MEM_EVENTS__MAX) return NULL; - if (!is_amd) - is_amd = perf_mem_is_amd_cpu(); - - if (is_amd == 1) + if (x86__is_amd_cpu()) return &perf_mem_events_amd[i]; return &perf_mem_events_intel[i]; -- cgit v1.2.3 From 75782e825377bd2745c0231a0f3483888514acb6 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 13 Jun 2023 22:11:29 +0530 Subject: perf python scripting: Get rid of unused import in arm-cs-trace-disasm The arm-cs-trace-disasm.py script doesn't use the sys library, so remove the import. Report by pylint: W0611: Unused import sys (unused-import) Signed-off-by: Sourabh Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/linux-perf-users/20230613164145.50488-2-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/arm-cs-trace-disasm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index 4339692a8d0b..d59ff53f1d94 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -9,7 +9,6 @@ from __future__ import print_function import os from os import path -import sys import re from subprocess import * from optparse import OptionParser, make_option -- cgit v1.2.3 From b3839ff1f40eba632177bc4775a35ed65a2262a6 Mon Sep 17 00:00:00 2001 From: Disha Goel Date: Tue, 13 Jun 2023 22:11:30 +0530 Subject: perf tests stat+json_output: Address shellcheck warnings Running shellcheck on stat+json_output testcase, generates below warning: [ $(id -u) != 0 ] && [ $(cat /proc/sys/kernel/perf_event_paranoid) -gt $1 ] ^------^ SC2046 (warning): Quote this to prevent word splitting. ^-- SC2046 (warning): Quote this to prevent word splitting. Fixed the warning by adding quotes to avoid word splitting. ShellCheck result with patch: # shellcheck -S warning stat+json_output.sh # perf test result after the change: 94: perf stat JSON output linter : Ok Signed-off-by: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/linux-perf-users/20230613164145.50488-3-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+json_output.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index c282afa6217c..196e22672c50 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -40,7 +40,7 @@ trap trap_cleanup EXIT TERM INT # Return true if perf_event_paranoid is > $1 and not running as root. function ParanoidAndNotRoot() { - [ $(id -u) != 0 ] && [ $(cat /proc/sys/kernel/perf_event_paranoid) -gt $1 ] + [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] } check_no_args() -- cgit v1.2.3 From 1bb17b4c6c91ad4d9468247cf5f5464fa6440668 Mon Sep 17 00:00:00 2001 From: Spoorthy S Date: Tue, 13 Jun 2023 22:11:31 +0530 Subject: perf tests arm_callgraph_fp: Address shellcheck warnings about signal names and adding double quotes for expression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running shellcheck -S on test_arm_calligraph_fp throws warnings SC2086 and SC3049,       $shellcheck -S warning tests/shell/test_arm_callgraph_fp.sh          rm -f $PERF_DATA             : Double quote to prevent globbing and word splitting.          trap cleanup_files exit term int       : In POSIX sh, using lower/mixed case for signal names is undefined. After fixing the warnings,       $shellcheck tests/shell/test_arm_callgraph_fp.sh       $ echo $?       0 To address the POSIX shell warnings added changes to convert Lowercase signal names to uppercase in the script and double quoted the command substitutions($fix to "$fix") to solve Globbing warnings. Signed-off-by: Spoorthy S Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-4-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_callgraph_fp.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh index e61d8deaa0c4..1380e0d12dce 100755 --- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh +++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh @@ -9,13 +9,13 @@ TEST_PROGRAM="perf test -w leafloop" cleanup_files() { - rm -f $PERF_DATA + rm -f "$PERF_DATA" } -trap cleanup_files exit term int +trap cleanup_files EXIT TERM INT # Add a 1 second delay to skip samples that are not in the leaf() function -perf record -o $PERF_DATA --call-graph fp -e cycles//u -D 1000 --user-callchains -- $TEST_PROGRAM 2> /dev/null & +perf record -o "$PERF_DATA" --call-graph fp -e cycles//u -D 1000 --user-callchains -- "$TEST_PROGRAM" 2> /dev/null & PID=$! echo " + Recording (PID=$PID)..." -- cgit v1.2.3 From 5bd35dfb48b0af870093f2ee130883228b49352a Mon Sep 17 00:00:00 2001 From: Shirisha G Date: Tue, 13 Jun 2023 22:11:32 +0530 Subject: perf tests daemon: Address shellcheck warnings Running shellcheck -S on daemon.sh throws below warnings: Result from shellcheck: # shellcheck -S warning daemon.sh local line_name=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $2 }'` ^-------^ SC2155: Declare and assign separately to avoid masking return values. trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM ^-------^ SC2064: Use single quotes, otherwise this expands now rather than when signalled. count=`ls ${base}/session-test/ | grep perf.data | wc -l` ^-- SC2010: Don't use ls | grep. Use a glob or a for loop with a condition to allow non-alphanumeric filenames. if [ ${size} != "OK" -o ${type} != "OK" ]; then ^-- SC2166: Prefer [ p ] || [ q ] as [ p -o q ] is not well defined. Fixed above warnings by: - declaring and assigning local variables separately - To fix SC2010, instead of using "ls | grep", used glob to allow non-alphanumeric filenames - Used single quotes to prevent expanding. Result from shellcheck after patch changes: $ shellcheck -S warning daemon.sh $ echo $? 0 Signed-off-by: Shirisha G Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-5-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/daemon.sh | 113 ++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 38 deletions(-) diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 45fc24af5b07..4c598cfc5afa 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -11,11 +11,16 @@ check_line_first() local lock=$5 local up=$6 - local line_name=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $2 }'` - local line_base=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $3 }'` - local line_output=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $4 }'` - local line_lock=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $5 }'` - local line_up=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $6 }'` + local line_name + line_name=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $2 }'` + local line_base + line_base=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $3 }'` + local line_output + line_output=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $4 }'` + local line_lock + line_lock=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $5 }'` + local line_up + line_up=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $6 }'` if [ "${name}" != "${line_name}" ]; then echo "FAILED: wrong name" @@ -54,13 +59,20 @@ check_line_other() local ack=$7 local up=$8 - local line_name=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $2 }'` - local line_run=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $3 }'` - local line_base=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $4 }'` - local line_output=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $5 }'` - local line_control=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $6 }'` - local line_ack=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $7 }'` - local line_up=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $8 }'` + local line_name + line_name=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $2 }'` + local line_run + line_run=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $3 }'` + local line_base + line_base=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $4 }'` + local line_output + line_output=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $5 }'` + local line_control + line_control=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $6 }'` + local line_ack + line_ack=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $7 }'` + local line_up + line_up=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $8 }'` if [ "${name}" != "${line_name}" ]; then echo "FAILED: wrong name" @@ -102,8 +114,10 @@ daemon_exit() { local config=$1 - local line=`perf daemon --config ${config} -x: | head -1` - local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` + local line + line=`perf daemon --config ${config} -x: | head -1` + local pid + pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` # Reset trap handler. trap - SIGINT SIGTERM @@ -123,7 +137,7 @@ daemon_start() perf daemon start --config ${config} # Clean up daemon if interrupted. - trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM + trap 'echo "FAILED: Signal caught"; daemon_exit "${config}"; exit 1' SIGINT SIGTERM # wait for the session to ping local state="FAIL" @@ -144,8 +158,10 @@ test_list() { echo "test daemon list" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) cat < ${config} [daemon] @@ -165,19 +181,22 @@ EOF # check first line # pid:daemon:base:base/output:base/lock - local line=`perf daemon --config ${config} -x: | head -1` + local line + line=`perf daemon --config ${config} -x: | head -1` check_line_first ${line} daemon ${base} ${base}/output ${base}/lock "0" # check 1st session # pid:size:-e cpu-clock:base/size:base/size/output:base/size/control:base/size/ack:0 - local line=`perf daemon --config ${config} -x: | head -2 | tail -1` + local line + line=`perf daemon --config ${config} -x: | head -2 | tail -1` check_line_other "${line}" size "-e cpu-clock -m 1 sleep 10" ${base}/session-size \ ${base}/session-size/output ${base}/session-size/control \ ${base}/session-size/ack "0" # check 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 - local line=`perf daemon --config ${config} -x: | head -3 | tail -1` + local line + line=`perf daemon --config ${config} -x: | head -3 | tail -1` check_line_other "${line}" time "-e task-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control \ ${base}/session-time/ack "0" @@ -193,8 +212,10 @@ test_reconfig() { echo "test daemon reconfig" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) # prepare config cat < ${config} @@ -215,10 +236,12 @@ EOF # check 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 - local line=`perf daemon --config ${config} -x: | head -3 | tail -1` + local line + line=`perf daemon --config ${config} -x: | head -3 | tail -1` check_line_other "${line}" time "-e task-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control ${base}/session-time/ack "0" - local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` + local pid + pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` # prepare new config local config_new=${config}.new @@ -249,7 +272,8 @@ EOF # check reconfigured 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 - local line=`perf daemon --config ${config} -x: | head -3 | tail -1` + local line + line=`perf daemon --config ${config} -x: | head -3 | tail -1` check_line_other "${line}" time "-e cpu-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control ${base}/session-time/ack "0" @@ -276,7 +300,8 @@ EOF state=`perf daemon ping --config ${config} --session size | awk '{ print $1 }'` done - local one=`perf daemon --config ${config} -x: | wc -l` + local one + one=`perf daemon --config ${config} -x: | wc -l` if [ ${one} -ne "1" ]; then echo "FAILED: wrong list output" @@ -312,8 +337,10 @@ test_stop() { echo "test daemon stop" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) # prepare config cat < ${config} @@ -332,8 +359,12 @@ EOF # start daemon daemon_start ${config} size - local pid_size=`perf daemon --config ${config} -x: | head -2 | tail -1 | awk 'BEGIN { FS = ":" } ; { print $1 }'` - local pid_time=`perf daemon --config ${config} -x: | head -3 | tail -1 | awk 'BEGIN { FS = ":" } ; { print $1 }'` + local pid_size + pid_size=`perf daemon --config ${config} -x: | head -2 | tail -1 | + awk 'BEGIN { FS = ":" } ; { print $1 }'` + local pid_time + pid_time=`perf daemon --config ${config} -x: | head -3 | tail -1 | + awk 'BEGIN { FS = ":" } ; { print $1 }'` # check that sessions are running if [ ! -d "/proc/${pid_size}" ]; then @@ -364,8 +395,10 @@ test_signal() { echo "test daemon signal" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) # prepare config cat < ${config} @@ -389,7 +422,7 @@ EOF daemon_exit ${config} # count is 2 perf.data for signals and 1 for perf record finished - count=`ls ${base}/session-test/ | grep perf.data | wc -l` + count=`ls ${base}/session-test/*perf.data* | wc -l` if [ ${count} -ne 3 ]; then error=1 echo "FAILED: perf data no generated" @@ -403,8 +436,10 @@ test_ping() { echo "test daemon ping" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) # prepare config cat < ${config} @@ -426,7 +461,7 @@ EOF size=`perf daemon ping --config ${config} --session size | awk '{ print $1 }'` type=`perf daemon ping --config ${config} --session time | awk '{ print $1 }'` - if [ ${size} != "OK" -o ${type} != "OK" ]; then + if [ ${size} != "OK" ] || [ ${type} != "OK" ]; then error=1 echo "FAILED: daemon ping failed" fi @@ -442,8 +477,10 @@ test_lock() { echo "test daemon lock" - local config=$(mktemp /tmp/perf.daemon.config.XXX) - local base=$(mktemp -d /tmp/perf.daemon.base.XXX) + local config + config=$(mktemp /tmp/perf.daemon.config.XXX) + local base + base=$(mktemp -d /tmp/perf.daemon.base.XXX) # prepare config cat < ${config} -- cgit v1.2.3 From 9e9d07a71fa44ead54eda05754d17aa02f18b5b2 Mon Sep 17 00:00:00 2001 From: Korrapati Likhitha Date: Tue, 13 Jun 2023 22:11:33 +0530 Subject: perf tests stat+csv_output: Fix shellcheck warnings Running the shellcheck on stat+csv_output resulted in the following warning. Result with shellcheck without patch: ===== $ shellcheck -S warning stat+csv_output.sh In stat+csv_output.sh line 23: [ $(uname -m) = "s390x" ] && exp='^[6-7]$' ^---------^ SC2046: Quote this to prevent word splitting. In stat+csv_output.sh line 51: [ $(id -u) != 0 ] && [ $(cat /proc/sys/kernel/perf_event_paranoid) -gt $1 ] ^------^ SC2046: Quote this to prevent word splitting. ^-- SC2046: Quote this to prevent word splitting. ===== Fixed the warning SC2046 by adding quotes to prevent word splitting. Result with shellcheck with patch: ===== $ shellcheck -S warning tests/shell/stat+csv_output.sh $ ./perf test "stat CSV output linter" 96: perf stat CSV output linter : Ok ===== Signed-off-by: Korrapati Likhitha Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-6-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Sathvika Vasireddy Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+csv_output.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh index a1969f236a0a..ed082daf839c 100755 --- a/tools/perf/tests/shell/stat+csv_output.sh +++ b/tools/perf/tests/shell/stat+csv_output.sh @@ -35,7 +35,7 @@ function commachecker() ;; "--interval") exp=7 ;; "--per-thread") exp=7 ;; "--system-wide-no-aggr") exp=7 - [ $(uname -m) = "s390x" ] && exp='^[6-7]$' + [ "$(uname -m)" = "s390x" ] && exp='^[6-7]$' ;; "--per-core") exp=8 ;; "--per-socket") exp=8 ;; "--per-node") exp=8 @@ -66,7 +66,7 @@ function commachecker() # Return true if perf_event_paranoid is > $1 and not running as root. function ParanoidAndNotRoot() { - [ $(id -u) != 0 ] && [ $(cat /proc/sys/kernel/perf_event_paranoid) -gt $1 ] + [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] } check_no_args() -- cgit v1.2.3 From 0ed4b531e7da1193fa10786672f28d7734eb06ec Mon Sep 17 00:00:00 2001 From: Anushree Mathur Date: Tue, 13 Jun 2023 22:11:35 +0530 Subject: perf tests test_arm_coresight: Shellcheck fixes Fixed the following shellcheck issues in test_arm_coresight.sh file: In tools/perf/tests/shell/test_arm_coresight.sh line 31: trap - exit term int ^--^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. ^--^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. ^-^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. In tools/perf/tests/shell/test_arm_coresight.sh line 35: trap cleanup_files exit term int ^--^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. ^--^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. ^-^ SC2039: In POSIX sh, using lower/mixed case for signal names is undefined. In tools/perf/tests/shell/test_arm_coresight.sh line 92: if [ $? -eq 0 -a -e "$1/enable_sink" ]; then ^-- SC2166: Prefer [ p ] && [ q ] as [ p -a q ] is not well defined. Fixed above warnings by: 1)Capitalize signals(INT, TERM, EXIT) to avoid mixed/lower case naming of signals. 2)Expression [p -a q] was not defined,changed it to [p] && [q] to avoid the ambiguity as this is older format using -a or -o ,now we use [p] && [q] in place of [p -a q] and [p] || [q] in place of [p -o q]. Result after fixing the issues: shell$ shellcheck -S warning test_arm_coresight.sh shell$ Signed-off-by: Anushree Mathur Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-8-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_coresight.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh index 482009e17bda..f1bf5621160f 100755 --- a/tools/perf/tests/shell/test_arm_coresight.sh +++ b/tools/perf/tests/shell/test_arm_coresight.sh @@ -28,11 +28,11 @@ cleanup_files() rm -f ${perfdata} rm -f ${file} rm -f "${perfdata}.old" - trap - exit term int + trap - EXIT TERM INT exit $glb_err } -trap cleanup_files exit term int +trap cleanup_files EXIT TERM INT record_touch_file() { echo "Recording trace (only user mode) with path: CPU$2 => $1" @@ -89,7 +89,7 @@ is_device_sink() { # cannot support perf PMU. echo "$1" | grep -E -q -v "tpiu" - if [ $? -eq 0 -a -e "$1/enable_sink" ]; then + if [ $? -eq 0 ] && [ -e "$1/enable_sink" ]; then pmu_dev="/sys/bus/event_source/devices/cs_etm/sinks/$2" -- cgit v1.2.3 From a6bdb815ad60f35f581ee0b48a886f7e451e34a3 Mon Sep 17 00:00:00 2001 From: Barnali Guha Thakurata Date: Tue, 13 Jun 2023 22:11:36 +0530 Subject: perf tests stat_all_metrics: Fix shellcheck warning SC2076 Fixed shellcheck warning SC2076 in stat_all_metrics.sh. Before the patch: shell$ shellcheck stat_all_metrics.sh In stat_all_metrics.sh line 9: if [[ "$result" =~ "${m:0:50}" ]] || [[ "$result" =~ "" ]] ^---------^ SC2076: Don't quote right-hand side of =~, it'll match literally rather than as a regex. In stat_all_metrics.sh line 15: if [[ "$result" =~ "${m:0:50}" ]] ^---------^ SC2076: Don't quote right-hand side of =~, it'll match literally rather than as a regex. In stat_all_metrics.sh line 22: if [[ "$result" =~ "${m:0:50}" ]] ^---------^ SC2076: Don't quote right-hand side of =~, it'll match literally rather than as a regex. After the patch: shell$ shellcheck stat_all_metrics.sh shell$ Signed-off-by: Barnali Guha Thakurata Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-9-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_all_metrics.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh index 22e9cb294b40..54774525e18a 100755 --- a/tools/perf/tests/shell/stat_all_metrics.sh +++ b/tools/perf/tests/shell/stat_all_metrics.sh @@ -6,20 +6,20 @@ err=0 for m in $(perf list --raw-dump metrics); do echo "Testing $m" result=$(perf stat -M "$m" true 2>&1) - if [[ "$result" =~ "${m:0:50}" ]] || [[ "$result" =~ "" ]] + if [[ "$result" =~ ${m:0:50} ]] || [[ "$result" =~ "" ]] then continue fi # Failed so try system wide. result=$(perf stat -M "$m" -a sleep 0.01 2>&1) - if [[ "$result" =~ "${m:0:50}" ]] + if [[ "$result" =~ ${m:0:50} ]] then continue fi # Failed again, possibly the workload was too small so retry with something # longer. result=$(perf stat -M "$m" perf bench internals synthesize 2>&1) - if [[ "$result" =~ "${m:0:50}" ]] + if [[ "$result" =~ ${m:0:50} ]] then continue fi -- cgit v1.2.3 From 9694dfe0a3fc81309f4c0a9a6a5f99b64caa851a Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Tue, 13 Jun 2023 22:11:37 +0530 Subject: perf tests test_task_analyzer: Fix shellcheck issues Fixed the following shellcheck issues in test_task_analyzer.sh file: SC2086: Double quote to prevent globbing and word splitting warnings in shell-check. Fixes the following shellcheck issues, SC2086: Double quote to prevent globbing and word splitting warnings in shell-check. Before Patch: $ shellcheck ./test_task_analyzer.sh | grep "SC2086" | ... In ./test_task_analyzer.sh line 13: SC2086: Double quote to prevent globbing and word splitting. In ./test_task_analyzer.sh line 24: SC2086: Double quote to prevent globbing and word splitting. In ./test_task_analyzer.sh line 39: SC2086: Double quote to prevent globbing and word splitting. After Patch: $ shellcheck ./test_task_analyzer.sh | grep -i "SC2086" None perf test result after patch: PASS: "test_basic" PASS: "test_ns_rename" PASS: "test_ms_filtertasks_highlight" PASS: "test_extended_times_timelimit_limittasks" PASS: "test_summary" PASS: "test_summaryextended" PASS: "test_summaryonly" PASS: "test_extended_times_summary_ns" PASS: "test_extended_times_summary_ns" PASS: "test_csv" PASS: "test_csvsummary" PASS: "test_csv_extended_times" PASS: "test_csvsummary_extended" Signed-off-by: Aboorva Devarajan Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-10-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 54 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index a98e4ab66040..4264b54b654b 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -10,7 +10,7 @@ cleanup() { rm -f perf.data.old rm -f csv rm -f csvsummary - rm -rf $tmpdir + rm -rf "$tmpdir" trap - exit term int } @@ -21,7 +21,7 @@ trap_cleanup() { trap trap_cleanup exit term int report() { - if [ $1 = 0 ]; then + if [ "$1" = 0 ]; then echo "PASS: \"$2\"" else echo "FAIL: \"$2\" Error message: \"$3\"" @@ -36,11 +36,11 @@ check_exec_0() { } find_str_or_fail() { - grep -q "$1" $2 - if [ $? != 0 ]; then - report 1 $3 "Failed to find required string:'${1}'." + grep -q "$1" "$2" + if [ "$?" != 0 ]; then + report 1 "$3" "Failed to find required string:'${1}'." else - report 0 $3 + report 0 "$3" fi } @@ -52,86 +52,86 @@ prepare_perf_data() { # check standard inkvokation with no arguments test_basic() { out="$tmpdir/perf.out" - perf script report task-analyzer > $out + perf script report task-analyzer > "$out" check_exec_0 "perf" - find_str_or_fail "Comm" $out ${FUNCNAME[0]} + find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } test_ns_rename(){ out="$tmpdir/perf.out" - perf script report task-analyzer --ns --rename-comms-by-tids 0:random > $out + perf script report task-analyzer --ns --rename-comms-by-tids 0:random > "$out" check_exec_0 "perf" - find_str_or_fail "Comm" $out ${FUNCNAME[0]} + find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } test_ms_filtertasks_highlight(){ out="$tmpdir/perf.out" perf script report task-analyzer --ms --filter-tasks perf --highlight-tasks perf \ - > $out + > "$out" check_exec_0 "perf" - find_str_or_fail "Comm" $out ${FUNCNAME[0]} + find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } test_extended_times_timelimit_limittasks() { out="$tmpdir/perf.out" perf script report task-analyzer --extended-times --time-limit :99999 \ - --limit-to-tasks perf > $out + --limit-to-tasks perf > "$out" check_exec_0 "perf" - find_str_or_fail "Out-Out" $out ${FUNCNAME[0]} + find_str_or_fail "Out-Out" "$out" "${FUNCNAME[0]}" } test_summary() { out="$tmpdir/perf.out" - perf script report task-analyzer --summary > $out + perf script report task-analyzer --summary > "$out" check_exec_0 "perf" - find_str_or_fail "Summary" $out ${FUNCNAME[0]} + find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_summaryextended() { out="$tmpdir/perf.out" - perf script report task-analyzer --summary-extended > $out + perf script report task-analyzer --summary-extended > "$out" check_exec_0 "perf" - find_str_or_fail "Inter Task Times" $out ${FUNCNAME[0]} + find_str_or_fail "Inter Task Times" "$out" "${FUNCNAME[0]}" } test_summaryonly() { out="$tmpdir/perf.out" - perf script report task-analyzer --summary-only > $out + perf script report task-analyzer --summary-only > "$out" check_exec_0 "perf" - find_str_or_fail "Summary" $out ${FUNCNAME[0]} + find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_extended_times_summary_ns() { out="$tmpdir/perf.out" - perf script report task-analyzer --extended-times --summary --ns > $out + perf script report task-analyzer --extended-times --summary --ns > "$out" check_exec_0 "perf" - find_str_or_fail "Out-Out" $out ${FUNCNAME[0]} - find_str_or_fail "Summary" $out ${FUNCNAME[0]} + find_str_or_fail "Out-Out" "$out" "${FUNCNAME[0]}" + find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_csv() { perf script report task-analyzer --csv csv > /dev/null check_exec_0 "perf" - find_str_or_fail "Comm;" csv ${FUNCNAME[0]} + find_str_or_fail "Comm;" csv "${FUNCNAME[0]}" } test_csv_extended_times() { perf script report task-analyzer --csv csv --extended-times > /dev/null check_exec_0 "perf" - find_str_or_fail "Out-Out;" csv ${FUNCNAME[0]} + find_str_or_fail "Out-Out;" csv "${FUNCNAME[0]}" } test_csvsummary() { perf script report task-analyzer --csv-summary csvsummary > /dev/null check_exec_0 "perf" - find_str_or_fail "Comm;" csvsummary ${FUNCNAME[0]} + find_str_or_fail "Comm;" csvsummary "${FUNCNAME[0]}" } test_csvsummary_extended() { perf script report task-analyzer --csv-summary csvsummary --summary-extended \ >/dev/null check_exec_0 "perf" - find_str_or_fail "Out-Out;" csvsummary ${FUNCNAME[0]} + find_str_or_fail "Out-Out;" csvsummary "${FUNCNAME[0]}" } prepare_perf_data -- cgit v1.2.3 From e0da03c7b16b466750f0bd91865a2a000f1422b7 Mon Sep 17 00:00:00 2001 From: Abhirup Deb Date: Tue, 13 Jun 2023 22:11:38 +0530 Subject: perf tests test_arm_spe: Address shellcheck warnings about signal name case Running shellcheck -S on test_arm_spe.sh throws below warnings: #shellcheck -S warning tests/shell/test_arm_spe.sh In tests/shell/test_arm_spe.sh line 30: trap cleanup_files exit term int ^--^ SC3049 (warning): In POSIX sh, using lower/mixed case for signal names is undefined. ^--^ SC3049 (warning): In POSIX sh, using lower/mixed case for signal names is undefined. ^-^ SC3049 (warning): In POSIX sh, using lower/mixed case for signal names is undefined. Fixed this issue by using uppercase for "EXIT", "TERM" and "INIT" signals to avoid using lower/mixed case for signal names as input. Signed-off-by: Abhirup Deb Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-11-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Mukesh Chaurasiya Signed-off-by: Ojaswin Mujoo Signed-off-by: Piyush Sachdeva Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_spe.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_arm_spe.sh b/tools/perf/tests/shell/test_arm_spe.sh index aa094d71f5b4..03d5c7d12ee5 100755 --- a/tools/perf/tests/shell/test_arm_spe.sh +++ b/tools/perf/tests/shell/test_arm_spe.sh @@ -27,7 +27,7 @@ cleanup_files() exit $glb_err } -trap cleanup_files exit term int +trap cleanup_files EXIT TERM INT arm_spe_report() { if [ $2 = 0 ]; then -- cgit v1.2.3 From fa33cbe26683607f69ed3b6885356e94fadc5ca2 Mon Sep 17 00:00:00 2001 From: Abhirup Deb Date: Tue, 13 Jun 2023 22:11:39 +0530 Subject: perf tests lock_contention: Fix shellscript errors Use quotes around variables to prevent POSIX word expansion, use uppercase for signals(INT, TERM, EXIT) to avoid mixed/lower case naming of signals and replace "==" with "=" as "==" is not supported by POSIX shell. Signed-off-by: Abhirup Deb Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-12-atrajeev@linux.vnet.ibm.com Signed-off-by: Anushree Mathur Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lock_contention.sh | 70 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh index be5fcafb26aa..f2cc187b6186 100755 --- a/tools/perf/tests/shell/lock_contention.sh +++ b/tools/perf/tests/shell/lock_contention.sh @@ -11,14 +11,14 @@ result=$(mktemp /tmp/__perf_test.result.XXXXX) cleanup() { rm -f ${perfdata} rm -f ${result} - trap - exit term int + trap - EXIT TERM INT } trap_cleanup() { cleanup exit ${err} } -trap trap_cleanup exit term int +trap trap_cleanup EXIT TERM INT check() { if [ `id -u` != 0 ]; then @@ -40,8 +40,8 @@ test_record() perf lock record -o ${perfdata} -- perf bench sched messaging > /dev/null 2>&1 # the output goes to the stderr and we expect only 1 output (-E 1) perf lock contention -i ${perfdata} -E 1 -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -58,8 +58,8 @@ test_bpf() # the perf lock contention output goes to the stderr perf lock con -a -b -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -70,8 +70,8 @@ test_record_concurrent() echo "Testing perf lock record and perf lock contention at the same time" perf lock record -o- -- perf bench sched messaging 2> /dev/null | \ perf lock contention -i- -E 1 -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -81,8 +81,8 @@ test_aggr_task() { echo "Testing perf lock contention --threads" perf lock contention -i ${perfdata} -t -E 1 -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -93,8 +93,8 @@ test_aggr_task() # the perf lock contention output goes to the stderr perf lock con -a -b -t -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -104,8 +104,8 @@ test_aggr_addr() { echo "Testing perf lock contention --lock-addr" perf lock contention -i ${perfdata} -l -E 1 -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -116,8 +116,8 @@ test_aggr_addr() # the perf lock contention output goes to the stderr perf lock con -a -b -l -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l) + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 exit fi @@ -127,8 +127,8 @@ test_type_filter() { echo "Testing perf lock contention --type-filter (w/ spinlock)" perf lock contention -i ${perfdata} -Y spinlock -q 2> ${result} - if [ $(grep -c -v spinlock "${result}") != "0" ]; then - echo "[Fail] Recorded result should not have non-spinlocks:" $(cat "${result}") + if [ "$(grep -c -v spinlock "${result}")" != "0" ]; then + echo "[Fail] Recorded result should not have non-spinlocks:" "$(cat "${result}")" err=1 exit fi @@ -138,8 +138,8 @@ test_type_filter() fi perf lock con -a -b -Y spinlock -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(grep -c -v spinlock "${result}") != "0" ]; then - echo "[Fail] BPF result should not have non-spinlocks:" $(cat "${result}") + if [ "$(grep -c -v spinlock "${result}")" != "0" ]; then + echo "[Fail] BPF result should not have non-spinlocks:" "$(cat "${result}")" err=1 exit fi @@ -149,7 +149,7 @@ test_lock_filter() { echo "Testing perf lock contention --lock-filter (w/ tasklist_lock)" perf lock contention -i ${perfdata} -l -q 2> ${result} - if [ $(grep -c tasklist_lock "${result}") != "1" ]; then + if [ "$(grep -c tasklist_lock "${result}")" != "1" ]; then echo "[Skip] Could not find 'tasklist_lock'" return fi @@ -159,8 +159,8 @@ test_lock_filter() # find out the type of tasklist_lock local type=$(head -1 "${result}" | awk '{ print $8 }' | sed -e 's/:.*//') - if [ $(grep -c -v "${type}" "${result}") != "0" ]; then - echo "[Fail] Recorded result should not have non-${type} locks:" $(cat "${result}") + if [ "$(grep -c -v "${type}" "${result}")" != "0" ]; then + echo "[Fail] Recorded result should not have non-${type} locks:" "$(cat "${result}")" err=1 exit fi @@ -170,8 +170,8 @@ test_lock_filter() fi perf lock con -a -b -L tasklist_lock -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(grep -c -v "${type}" "${result}") != "0" ]; then - echo "[Fail] BPF result should not have non-${type} locks:" $(cat "${result}") + if [ "$(grep -c -v "${type}" "${result}")" != "0" ]; then + echo "[Fail] BPF result should not have non-${type} locks:" "$(cat "${result}")" err=1 exit fi @@ -181,14 +181,14 @@ test_stack_filter() { echo "Testing perf lock contention --callstack-filter (w/ unix_stream)" perf lock contention -i ${perfdata} -v -q 2> ${result} - if [ $(grep -c unix_stream "${result}") == "0" ]; then + if [ "$(grep -c unix_stream "${result}")" = "0" ]; then echo "[Skip] Could not find 'unix_stream'" return fi perf lock contention -i ${perfdata} -E 1 -S unix_stream -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result should have a lock from unix_stream:" $(cat "${result}") + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result should have a lock from unix_stream:" "$(cat "${result}")" err=1 exit fi @@ -198,8 +198,8 @@ test_stack_filter() fi perf lock con -a -b -S unix_stream -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] BPF result should have a lock from unix_stream:" $(cat "${result}") + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] BPF result should have a lock from unix_stream:" "$(cat "${result}")" err=1 exit fi @@ -209,14 +209,14 @@ test_aggr_task_stack_filter() { echo "Testing perf lock contention --callstack-filter with task aggregation" perf lock contention -i ${perfdata} -v -q 2> ${result} - if [ $(grep -c unix_stream "${result}") == "0" ]; then + if [ "$(grep -c unix_stream "${result}")" = "0" ]; then echo "[Skip] Could not find 'unix_stream'" return fi perf lock contention -i ${perfdata} -t -E 1 -S unix_stream -q 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] Recorded result should have a task from unix_stream:" $(cat "${result}") + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] Recorded result should have a task from unix_stream:" "$(cat "${result}")" err=1 exit fi @@ -226,8 +226,8 @@ test_aggr_task_stack_filter() fi perf lock con -a -b -t -S unix_stream -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result} - if [ $(cat "${result}" | wc -l) != "1" ]; then - echo "[Fail] BPF result should have a task from unix_stream:" $(cat "${result}") + if [ "$(cat "${result}" | wc -l)" != "1" ]; then + echo "[Fail] BPF result should have a task from unix_stream:" "$(cat "${result}")" err=1 exit fi -- cgit v1.2.3 From ed46a9994956b5ee53d62f848b1b69579201a7ec Mon Sep 17 00:00:00 2001 From: Samir Mulani Date: Tue, 13 Jun 2023 22:11:40 +0530 Subject: perf tests shell: Fixed shellcheck warnings Fixed the shellcheck warnings in buildid.sh, record+probe_libc_inet_pton.sh and record+script_probe_vfs_getname.sh perf shell scripts: 1. Prefer [ p ] && [ q ] as [ p -a q ] is not well defined. 2. Prefer [ p ] || [ q ] as [ p -o q ] is not well defined. 3. Used * argument to avoid the argument mixes string and array 4. Resolved issue for variable refernce, where the variable is being used before it has been initialized. 5. Resolved word splitting issue (syntax error). 6. The "err" variable has been removed from buildid.sh since it is not used anywhere in the code. Signed-off-by: Samir Mulani Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-13-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/buildid.sh | 12 ++++++------ tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 6 +++--- tools/perf/tests/shell/record+script_probe_vfs_getname.sh | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/tests/shell/buildid.sh b/tools/perf/tests/shell/buildid.sh index 0ce22ea0a7f1..3383ca3399d4 100755 --- a/tools/perf/tests/shell/buildid.sh +++ b/tools/perf/tests/shell/buildid.sh @@ -83,12 +83,12 @@ check() # in case of pe-file.exe file echo $1 | grep ".exe" if [ $? -eq 0 ]; then - if [ -x $1 -a ! -x $file ]; then + if [ -x $1 ] && [ ! -x $file ]; then echo "failed: file ${file} executable does not exist" exit 1 fi - if [ ! -x $file -a ! -e $file ]; then + if [ ! -x $file ] && [ ! -e $file ]; then echo "failed: file ${file} does not exist" exit 1 fi @@ -136,10 +136,10 @@ test_record() log_err=$(mktemp /tmp/perf.log.err.XXX) perf="perf --buildid-dir ${build_id_dir}" - echo "running: perf record $@" - ${perf} record --buildid-all -o ${data} $@ 1>${log_out} 2>${log_err} + echo "running: perf record $*" + ${perf} record --buildid-all -o ${data} "$@" 1>${log_out} 2>${log_err} if [ $? -ne 0 ]; then - echo "failed: record $@" + echo "failed: record $*" echo "see log: ${log_err}" exit 1 fi @@ -172,4 +172,4 @@ if [ ${run_pe} -eq 1 ]; then rm -r ${wineprefix} fi -exit ${err} +exit 0 diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index bbb5b3d185fa..0934fb0cd68f 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -10,8 +10,8 @@ # SPDX-License-Identifier: GPL-2.0 # Arnaldo Carvalho de Melo , 2017 -. $(dirname $0)/lib/probe.sh -. $(dirname $0)/lib/probe_vfs_getname.sh +. "$(dirname "$0")/lib/probe.sh" +. "$(dirname "$0")/lib/probe_vfs_getname.sh" libc=$(grep -w libc /proc/self/maps | head -1 | sed -r 's/.*[[:space:]](\/.*)/\1/g') nm -Dg $libc 2>/dev/null | fgrep -q inet_pton || exit 254 @@ -23,7 +23,7 @@ add_libc_inet_pton_event() { event_name=$(perf probe -f -x $libc -a inet_pton 2>&1 | tail -n +2 | head -n -5 | \ grep -P -o "$event_pattern(?=[[:space:]]\(on inet_pton in $libc\))") - if [ $? -ne 0 -o -z "$event_name" ] ; then + if [ $? -ne 0 ] || [ -z "$event_name" ] ; then printf "FAIL: could not add event\n" return 1 fi diff --git a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh index 1341437e1bd9..7f664f1889d9 100755 --- a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh +++ b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh @@ -9,11 +9,11 @@ # SPDX-License-Identifier: GPL-2.0 # Arnaldo Carvalho de Melo , 2017 -. $(dirname $0)/lib/probe.sh +. "$(dirname "$0")/lib/probe.sh" skip_if_no_perf_probe || exit 2 -. $(dirname $0)/lib/probe_vfs_getname.sh +. "$(dirname "$0")/lib/probe_vfs_getname.sh" record_open_file() { echo "Recording open file:" -- cgit v1.2.3 From 3b3bf0d112163524e61d1d6456c65e2157aade74 Mon Sep 17 00:00:00 2001 From: Geetika Date: Tue, 13 Jun 2023 22:11:41 +0530 Subject: perf tests test_brstack.sh: Fix all POSIX sh warnings Fix all the POSIX sh warnings in perf shell test test_brstack.sh Warnings fixed : * In POSIX sh, using lower/mixed case for signal names is undefined. Correcting that in this script. * In POSIX sh, 'local' is undefined. local is supported in many shells, but it's not in POSIX. In POSIX sh, you can adopt some convention to avoid accidentally overwriting variables names, e.g. prefixing with the function name, that is what I have done here. Signed-off-by: Geetika Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-14-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_brstack.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh index 1c49d8293003..09908d71c994 100755 --- a/tools/perf/tests/shell/test_brstack.sh +++ b/tools/perf/tests/shell/test_brstack.sh @@ -18,7 +18,7 @@ cleanup() { rm -rf $TMPDIR } -trap cleanup exit term int +trap cleanup EXIT TERM INT test_user_branches() { echo "Testing user branch stack sampling" @@ -47,17 +47,17 @@ test_user_branches() { # first argument is the argument passed to "--branch-stack ,save_type,u" # second argument are the expected branch types for the given filter test_filter() { - local filter=$1 - local expect=$2 + test_filter_filter=$1 + test_filter_expect=$2 - echo "Testing branch stack filtering permutation ($filter,$expect)" + echo "Testing branch stack filtering permutation ($test_filter_filter,$test_filter_expect)" - perf record -o $TMPDIR/perf.data --branch-filter $filter,save_type,u -- ${TESTPROG} > /dev/null 2>&1 + perf record -o $TMPDIR/perf.data --branch-filter $test_filter_filter,save_type,u -- ${TESTPROG} > /dev/null 2>&1 perf script -i $TMPDIR/perf.data --fields brstack | xargs -n1 > $TMPDIR/perf.script # fail if we find any branch type that doesn't match any of the expected ones # also consider UNKNOWN branch types (-) - if grep -E -vm1 "^[^ ]*/($expect|-|( *))/.*$" $TMPDIR/perf.script; then + if grep -E -vm1 "^[^ ]*/($test_filter_expect|-|( *))/.*$" $TMPDIR/perf.script; then return 1 fi } -- cgit v1.2.3 From c4a1a7763da3a7345f338e3666e3f7749b96b734 Mon Sep 17 00:00:00 2001 From: Spoorthy S Date: Tue, 13 Jun 2023 22:11:42 +0530 Subject: perf tests stat+shadow_stat.sh: Fix all POSIX sh warnings found using shellcheck Running shellcheck -S on stat+shadow_stat.sh testcase, generates SC2046 and SC2034 warnings, $ shellcheck -S warning tests/shell/stat+shadow_stat.sh res=`printf "%.2f" $(echo "scale=6; $num / $cyc" | bc -q)` : Quote this to prevent word splitting To address the POSIX shell warnings used quotes in the printf expressions, to prevent word splitting. Signed-off-by: Spoorthy S Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-15-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+shadow_stat.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/stat+shadow_stat.sh b/tools/perf/tests/shell/stat+shadow_stat.sh index e6e35fc6c882..0e9cba84e757 100755 --- a/tools/perf/tests/shell/stat+shadow_stat.sh +++ b/tools/perf/tests/shell/stat+shadow_stat.sh @@ -33,7 +33,7 @@ test_global_aggr() fi # use printf for rounding and a leading zero - res=`printf "%.2f" $(echo "scale=6; $num / $cyc" | bc -q)` + res=`printf "%.2f" "$(echo "scale=6; $num / $cyc" | bc -q)"` if [ "$ipc" != "$res" ]; then echo "IPC is different: $res != $ipc ($num / $cyc)" exit 1 @@ -67,7 +67,7 @@ test_no_aggr() fi # use printf for rounding and a leading zero - res=`printf "%.2f" $(echo "scale=6; $num / $cyc" | bc -q)` + res=`printf "%.2f" "$(echo "scale=6; $num / $cyc" | bc -q)"` if [ "$ipc" != "$res" ]; then echo "IPC is different for $cpu: $res != $ipc ($num / $cyc)" exit 1 -- cgit v1.2.3 From 5c4396efb53ef07d046a2e9456b240880e0c3076 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Tue, 13 Jun 2023 22:11:43 +0530 Subject: perf tests task_analyzer: Fix bad substitution ${$1} ${$1} gives bad substitution error on sh, bash, and zsh. This seems like a typo, and this patch modifies it to $1, since that is what it's usage looks like from wherever `check_exec_0` is called. This issue due to ${$1} caused all function calls to give error in `find_str_or_fail` line, and so no test runs completely. But 'perf test "perf script task-analyzer tests"' wrongly reports that tests passed with the status OK, which is wrong considering the tests didn't even run completely Fixes: e8478b84d6ba9ccf ("perf test: add new task-analyzer tests") Signed-off-by: Aditya Gupta Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Cc: Disha Goel Cc: Hagen Paul Pfeifer Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Petar Gligoric Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-16-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index 4264b54b654b..84ab7e7f57d5 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -31,7 +31,7 @@ report() { check_exec_0() { if [ $? != 0 ]; then - report 1 "invokation of ${$1} command failed" + report 1 "invocation of $1 command failed" fi } -- cgit v1.2.3 From b8e55fde9f663bd582d8f0b673fa8735f0dcca47 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Tue, 13 Jun 2023 22:11:44 +0530 Subject: perf tests task_analyzer: Print command that failed instead of just "perf" Instead of printing "perf command failed" everytime, print the exact command that run earlier Signed-off-by: Aditya Gupta Acked-by: Hagen Paul Pfeifer Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-17-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index 84ab7e7f57d5..b094eeb3bf66 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -53,14 +53,14 @@ prepare_perf_data() { test_basic() { out="$tmpdir/perf.out" perf script report task-analyzer > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer" find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } test_ns_rename(){ out="$tmpdir/perf.out" perf script report task-analyzer --ns --rename-comms-by-tids 0:random > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --ns --rename-comms-by-tids 0:random" find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } @@ -68,7 +68,7 @@ test_ms_filtertasks_highlight(){ out="$tmpdir/perf.out" perf script report task-analyzer --ms --filter-tasks perf --highlight-tasks perf \ > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --ms --filter-tasks perf --highlight-tasks perf" find_str_or_fail "Comm" "$out" "${FUNCNAME[0]}" } @@ -76,61 +76,61 @@ test_extended_times_timelimit_limittasks() { out="$tmpdir/perf.out" perf script report task-analyzer --extended-times --time-limit :99999 \ --limit-to-tasks perf > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --extended-times --time-limit :99999 --limit-to-tasks perf" find_str_or_fail "Out-Out" "$out" "${FUNCNAME[0]}" } test_summary() { out="$tmpdir/perf.out" perf script report task-analyzer --summary > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --summary" find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_summaryextended() { out="$tmpdir/perf.out" perf script report task-analyzer --summary-extended > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --summary-extended" find_str_or_fail "Inter Task Times" "$out" "${FUNCNAME[0]}" } test_summaryonly() { out="$tmpdir/perf.out" perf script report task-analyzer --summary-only > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --summary-only" find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_extended_times_summary_ns() { out="$tmpdir/perf.out" perf script report task-analyzer --extended-times --summary --ns > "$out" - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --extended-times --summary --ns" find_str_or_fail "Out-Out" "$out" "${FUNCNAME[0]}" find_str_or_fail "Summary" "$out" "${FUNCNAME[0]}" } test_csv() { perf script report task-analyzer --csv csv > /dev/null - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --csv csv" find_str_or_fail "Comm;" csv "${FUNCNAME[0]}" } test_csv_extended_times() { perf script report task-analyzer --csv csv --extended-times > /dev/null - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --csv csv --extended-times" find_str_or_fail "Out-Out;" csv "${FUNCNAME[0]}" } test_csvsummary() { perf script report task-analyzer --csv-summary csvsummary > /dev/null - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --csv-summary csvsummary" find_str_or_fail "Comm;" csvsummary "${FUNCNAME[0]}" } test_csvsummary_extended() { perf script report task-analyzer --csv-summary csvsummary --summary-extended \ >/dev/null - check_exec_0 "perf" + check_exec_0 "perf script report task-analyzer --csv-summary csvsummary --summary-extended" find_str_or_fail "Out-Out;" csvsummary "${FUNCNAME[0]}" } -- cgit v1.2.3 From c3ac3b0779770acd3ad7eecb5099ab4419ef2e2e Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Tue, 13 Jun 2023 22:11:45 +0530 Subject: perf tests task_analyzer: Skip tests if no libtraceevent support Test "perf script task-analyzer tests" fails in environment with missing libtraceevent support, as perf record fails to create the perf.data file, which further tests depend on. Instead, when perf is not compiled with libtraceevent support, skip those tests instead of failing them, by checking the output of `perf record --dry-run` to see if it prints the error "libtraceevent is necessary for tracepoint support" For the following output, perf compiled with: `make NO_LIBTRACEEVENT=1` Before the patch: 108: perf script task-analyzer tests : test child forked, pid 24105 failed to open perf.data: No such file or directory (try 'perf record' first) FAIL: "invokation of perf script report task-analyzer command failed" Error message: "" FAIL: "test_basic" Error message: "Failed to find required string:'Comm'." failed to open perf.data: No such file or directory (try 'perf record' first) FAIL: "invokation of perf script report task-analyzer --ns --rename-comms-by-tids 0:random command failed" Error message: "" FAIL: "test_ns_rename" Error message: "Failed to find required string:'Comm'." failed to open perf.data: No such file or directory (try 'perf record' first) <...> perf script task-analyzer tests: FAILED! With this patch, the script instead returns 2 signifying SKIP, and after the patch: 108: perf script task-analyzer tests : test child forked, pid 26010 libtraceevent is necessary for tracepoint support WARN: Skipping tests. No libtraceevent support test child finished with -2 perf script task-analyzer tests: Skip Fixes: e8478b84d6ba9ccf ("perf test: Add new task-analyzer tests") Signed-off-by: Aditya Gupta Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: John Garry Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Petar Gligoric Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230613164145.50488-18-atrajeev@linux.vnet.ibm.com Signed-off-by: Athira Rajeev Signed-off-by: Kajol Jain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index b094eeb3bf66..59785dfc11f8 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -44,9 +44,20 @@ find_str_or_fail() { fi } +# check if perf is compiled with libtraceevent support +skip_no_probe_record_support() { + perf record -e "sched:sched_switch" -a -- sleep 1 2>&1 | grep "libtraceevent is necessary for tracepoint support" && return 2 + return 0 +} + prepare_perf_data() { # 1s should be sufficient to catch at least some switches perf record -e sched:sched_switch -a -- sleep 1 > /dev/null 2>&1 + # check if perf data file got created in above step. + if [ ! -e "perf.data" ]; then + printf "FAIL: perf record failed to create \"perf.data\" \n" + return 1 + fi } # check standard inkvokation with no arguments @@ -134,6 +145,13 @@ test_csvsummary_extended() { find_str_or_fail "Out-Out;" csvsummary "${FUNCNAME[0]}" } +skip_no_probe_record_support +err=$? +if [ $err -ne 0 ]; then + echo "WARN: Skipping tests. No libtraceevent support" + cleanup + exit $err +fi prepare_perf_data test_basic test_ns_rename -- cgit v1.2.3 From e2595550177d8ea42083c9fd8e8a9d4acd5604ec Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 7 Jun 2023 09:26:59 -0700 Subject: pert tests: Support metricgroup perf stat JSON output A new field metricgroup has been added in the perf stat JSON output. Support it in the test case. Signed-off-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230607162700.3234712-8-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_json_output_lint.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py index b81582a89d36..5e9bd68c83fe 100644 --- a/tools/perf/tests/shell/lib/perf_json_output_lint.py +++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py @@ -55,6 +55,7 @@ def check_json_output(expected_items): 'interval': lambda x: isfloat(x), 'metric-unit': lambda x: True, 'metric-value': lambda x: isfloat(x), + 'metricgroup': lambda x: True, 'node': lambda x: True, 'pcnt-running': lambda x: isfloat(x), 'socket': lambda x: True, @@ -70,6 +71,8 @@ def check_json_output(expected_items): # values and possibly other prefixes like interval, core and # aggregate-number. pass + elif count != expected_items and count >= 1 and count <= 5 and 'metricgroup' in item: + pass elif count != expected_items: raise RuntimeError(f'wrong number of fields. counted {count} expected {expected_items}' f' in \'{item}\'') -- cgit v1.2.3 From 556fd664d666c0cc9d5b0d52851b0480c51cf59e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 7 Jun 2023 09:26:56 -0700 Subject: perf vendor events arm64: Add default tags into topdown L1 metrics Add the default tags for ARM as well. Signed-off-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230607162700.3234712-5-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/sbsa.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/pmu-events/arch/arm64/sbsa.json b/tools/perf/pmu-events/arch/arm64/sbsa.json index f678c37ea9c3..f90b338261ac 100644 --- a/tools/perf/pmu-events/arch/arm64/sbsa.json +++ b/tools/perf/pmu-events/arch/arm64/sbsa.json @@ -2,28 +2,32 @@ { "MetricExpr": "stall_slot_frontend / (#slots * cpu_cycles)", "BriefDescription": "Frontend bound L1 topdown metric", - "MetricGroup": "TopdownL1", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", "MetricName": "frontend_bound", "ScaleUnit": "100%" }, { "MetricExpr": "(1 - op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", "BriefDescription": "Bad speculation L1 topdown metric", - "MetricGroup": "TopdownL1", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", "MetricName": "bad_speculation", "ScaleUnit": "100%" }, { "MetricExpr": "(op_retired / op_spec) * (1 - stall_slot / (#slots * cpu_cycles))", "BriefDescription": "Retiring L1 topdown metric", - "MetricGroup": "TopdownL1", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", "MetricName": "retiring", "ScaleUnit": "100%" }, { "MetricExpr": "stall_slot_backend / (#slots * cpu_cycles)", "BriefDescription": "Backend Bound L1 topdown metric", - "MetricGroup": "TopdownL1", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", "MetricName": "backend_bound", "ScaleUnit": "100%" } -- cgit v1.2.3 From 99d4850062a84564f36923764bb93935ef2ed108 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 13 Jun 2023 16:54:16 -0700 Subject: perf tool x86: Fix perf_env memory leak Found by leak sanitizer: ``` ==1632594==ERROR: LeakSanitizer: detected memory leaks Direct leak of 21 byte(s) in 1 object(s) allocated from: #0 0x7f2953a7077b in __interceptor_strdup ../../../../src/libsanitizer/asan/asan_interceptors.cpp:439 #1 0x556701d6fbbf in perf_env__read_cpuid util/env.c:369 #2 0x556701d70589 in perf_env__cpuid util/env.c:465 #3 0x55670204bba2 in x86__is_amd_cpu arch/x86/util/env.c:14 #4 0x5567020487a2 in arch__post_evsel_config arch/x86/util/evsel.c:83 #5 0x556701d8f78b in evsel__config util/evsel.c:1366 #6 0x556701ef5872 in evlist__config util/record.c:108 #7 0x556701cd6bcd in test__PERF_RECORD tests/perf-record.c:112 #8 0x556701cacd07 in run_test tests/builtin-test.c:236 #9 0x556701cacfac in test_and_print tests/builtin-test.c:265 #10 0x556701cadddb in __cmd_test tests/builtin-test.c:402 #11 0x556701caf2aa in cmd_test tests/builtin-test.c:559 #12 0x556701d3b557 in run_builtin tools/perf/perf.c:323 #13 0x556701d3bac8 in handle_internal_command tools/perf/perf.c:377 #14 0x556701d3be90 in run_argv tools/perf/perf.c:421 #15 0x556701d3c3f8 in main tools/perf/perf.c:537 #16 0x7f2952a46189 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 SUMMARY: AddressSanitizer: 21 byte(s) leaked in 1 allocation(s). ``` Fixes: f7b58cbdb3ff36eb ("perf mem/c2c: Add load store event mappings for AMD") Signed-off-by: Ian Rogers Acked-by: Ravi Bangoria Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20230613235416.1650755-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/env.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/env.c b/tools/perf/arch/x86/util/env.c index 33b87f8ac1cc..3e537ffb1353 100644 --- a/tools/perf/arch/x86/util/env.c +++ b/tools/perf/arch/x86/util/env.c @@ -13,7 +13,7 @@ bool x86__is_amd_cpu(void) perf_env__cpuid(&env); is_amd = env.cpuid && strstarts(env.cpuid, "AuthenticAMD") ? 1 : -1; - + perf_env__exit(&env); ret: return is_amd >= 1 ? true : false; } -- cgit v1.2.3 From f4c0d5309a3e5f16ca3c3854b1e719dace843e03 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Jun 2023 23:18:11 -0700 Subject: tools api: Add simple timeout to io read In situations like reading from a pipe it can be useful to have a timeout so that the caller doesn't block indefinitely. Implement a simple one based on poll. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: http://lore.kernel.org/lkml/20230608061812.3715566-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/io.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/lib/api/io.h b/tools/lib/api/io.h index d5e8cf0dada0..9fc429d2852d 100644 --- a/tools/lib/api/io.h +++ b/tools/lib/api/io.h @@ -8,6 +8,7 @@ #define __API_IO__ #include +#include #include #include #include @@ -23,6 +24,8 @@ struct io { char *end; /* Currently accessed data pointer. */ char *data; + /* Read timeout, 0 implies no timeout. */ + int timeout_ms; /* Set true on when the end of file on read error. */ bool eof; }; @@ -35,6 +38,7 @@ static inline void io__init(struct io *io, int fd, io->buf = buf; io->end = buf; io->data = buf; + io->timeout_ms = 0; io->eof = false; } @@ -47,7 +51,29 @@ static inline int io__get_char(struct io *io) return -1; if (ptr == io->end) { - ssize_t n = read(io->fd, io->buf, io->buf_len); + ssize_t n; + + if (io->timeout_ms != 0) { + struct pollfd pfds[] = { + { + .fd = io->fd, + .events = POLLIN, + }, + }; + + n = poll(pfds, 1, io->timeout_ms); + if (n == 0) + errno = ETIMEDOUT; + if (n > 0 && !(pfds[0].revents & POLLIN)) { + errno = EIO; + n = -1; + } + if (n <= 0) { + io->eof = true; + return -1; + } + } + n = read(io->fd, io->buf, io->buf_len); if (n <= 0) { io->eof = true; -- cgit v1.2.3 From 701677b95764c06bb058c92be11c3a4ad25ab5f2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Jun 2023 23:18:12 -0700 Subject: perf srcline: Add a timeout to reading from addr2line addr2line may fail to send expected values causing perf to wait indefinitely. Add a 1 second timeout (twice the timeout for reading from /proc/pid/maps) so that such reads don't cause perf to appear to lock up. There are already checks that the file for addr2line contains a debug section but this isn't always sufficient. The problem was observed when a valid elf file would set the configuration for binutils addr2line, then a later read of vmlinux with ELF debug sections would cause a failing write/read which would block indefinitely. As a service to future readers, if the io hits eof or an error, cleanup the addr2line process. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20230608061812.3715566-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/config.c | 7 +++++-- tools/perf/util/srcline.c | 10 ++++++++-- tools/perf/util/srcline.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index f340dc73db6d..46f144c46827 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -19,6 +19,7 @@ #include "util/llvm-utils.h" /* perf_llvm_config */ #include "util/stat.h" /* perf_stat__set_big_num */ #include "util/evsel.h" /* evsel__hw_names, evsel__use_bpf_counters */ +#include "util/srcline.h" /* addr2line_timeout_ms */ #include "build-id.h" #include "debug.h" #include "config.h" @@ -434,12 +435,14 @@ static int perf_buildid_config(const char *var, const char *value) return 0; } -static int perf_default_core_config(const char *var __maybe_unused, - const char *value __maybe_unused) +static int perf_default_core_config(const char *var, const char *value) { if (!strcmp(var, "core.proc-map-timeout")) proc_map_timeout = strtoul(value, NULL, 10); + if (!strcmp(var, "core.addr2line-timeout")) + addr2line_timeout_ms = strtoul(value, NULL, 10); + /* Add other config variables here. */ return 0; } diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index b27b4b3c391b..c013bcbdfd42 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -21,6 +21,8 @@ #include "symbol.h" #include "subcmd/run-command.h" +/* If addr2line doesn't return data for 1 second then timeout. */ +int addr2line_timeout_ms = 1 * 1000; bool srcline_full_filename; char *srcline__unknown = (char *)"??:0"; @@ -631,7 +633,7 @@ static int addr2line(const char *dso_name, u64 addr, int len; char buf[128]; ssize_t written; - struct io io; + struct io io = { .eof = false }; enum a2l_style a2l_style; if (!a2l) { @@ -670,7 +672,7 @@ static int addr2line(const char *dso_name, u64 addr, goto out; } io__init(&io, a2l->out, buf, sizeof(buf)); - + io.timeout_ms = addr2line_timeout_ms; switch (read_addr2line_record(&io, a2l_style, &record_function, &record_filename, &record_line_nr)) { case -1: @@ -741,6 +743,10 @@ static int addr2line(const char *dso_name, u64 addr, out: free(record_function); free(record_filename); + if (io.eof) { + dso->a2l = NULL; + addr2line_subprocess_cleanup(a2l); + } return ret; } diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index 167645bcff07..75010d39ea28 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -9,6 +9,7 @@ struct dso; struct symbol; +extern int addr2line_timeout_ms; extern bool srcline_full_filename; char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym, bool show_addr, u64 ip); -- cgit v1.2.3 From e90208e9ffe6cbeb3c14cba14082137bcb633ffe Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 14 Jun 2023 19:50:41 -0700 Subject: perf srcline: Fix handling of inline functions We write an address then a ',' to addr2line. With inline data we generally get back (// are my comments): 0x1234 // address foo // function name foo.c:123 // filename:line bar // function name bar.c:123 // filename:line 0x000000000000000 // sentinel address created by ',' ?? // unknown function name ??:0 // unknown filename:line The code was assuming the inline data also had the address, which is incorrect. This means the first inline function name (bar above) needs to be checked to see if it is the sentinel, otherwise to be treated as a function name. The regression was caused by the addition of addresses as the kernel is reporting a symbol at address 0 (used by GNU binutils when it interprets ','). Committer testing: Using: # perf trace --call-graph=dwarf -e lock:contention_* 1244.615 TaskCon~ller #/2645281 lock:contention_begin(lock_addr: 0xffff8e6748da5ab0, flags: 2) __preempt_count_dec_and_test (inlined) trace_contention_begin (inlined) trace_contention_begin (inlined) rwsem_down_read_slowpath ([kernel.kallsyms]) __preempt_count_dec_and_test (inlined) trace_contention_begin (inlined) trace_contention_begin (inlined) rwsem_down_read_slowpath ([kernel.kallsyms]) __down_read_common (inlined) __down_read (inlined) down_read ([kernel.kallsyms]) arch_static_branch (inlined) static_key_false (inlined) __mmap_lock_trace_acquire_returned (inlined) mmap_read_lock (inlined) do_user_addr_fault ([kernel.kallsyms]) arch_local_irq_disable (inlined) handle_page_fault (inlined) exc_page_fault ([kernel.kallsyms]) asm_exc_page_fault ([kernel.kallsyms]) [0x4def008] (/usr/lib64/firefox/libxul.so) 1244.619 TaskCon~ller #/2645281 lock:contention_end(lock_addr: 0xffff8e6748da5ab0) __preempt_count_dec_and_test (inlined) trace_contention_end (inlined) trace_contention_end (inlined) rwsem_down_read_slowpath ([kernel.kallsyms]) __preempt_count_dec_and_test (inlined) trace_contention_end (inlined) trace_contention_end (inlined) rwsem_down_read_slowpath ([kernel.kallsyms]) __down_read_common (inlined) __down_read (inlined) down_read ([kernel.kallsyms]) arch_static_branch (inlined) static_key_false (inlined) __mmap_lock_trace_acquire_returned (inlined) mmap_read_lock (inlined) do_user_addr_fault ([kernel.kallsyms]) arch_local_irq_disable (inlined) handle_page_fault (inlined) exc_page_fault ([kernel.kallsyms]) asm_exc_page_fault ([kernel.kallsyms]) Fixes: 8dc26b6f718a8118 ("perf srcline: Make sentinel reading for binutils addr2line more robust") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: llvm@lists.linux.dev Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Tom Rix Link: https://lore.kernel.org/r/20230615025041.1982072-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 136 +++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 56 deletions(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index c013bcbdfd42..034b496df297 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -389,7 +389,7 @@ static int filename_split(char *filename, unsigned int *line_nr) *line_nr = strtoul(sep, NULL, 0); return 1; } - + pr_debug("addr2line missing ':' in filename split\n"); return 0; } @@ -465,10 +465,12 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char style = LLVM; cached = true; lines = 1; + pr_debug("Detected LLVM addr2line style\n"); } else if (ch == '0') { style = GNU_BINUTILS; cached = true; lines = 3; + pr_debug("Detected binutils addr2line style\n"); } else { if (!symbol_conf.disable_add2line_warn) { char *output = NULL; @@ -479,6 +481,7 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char __func__, dso_name); pr_warning("\t%c%s", ch, output); } + pr_debug("Unknown/broken addr2line style\n"); return BROKEN; } while (lines) { @@ -496,6 +499,9 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char static int read_addr2line_record(struct io *io, enum a2l_style style, + const char *dso_name, + u64 addr, + bool first, char **function, char **filename, unsigned int *line_nr) @@ -521,56 +527,62 @@ static int read_addr2line_record(struct io *io, *line_nr = 0; /* - * Read the first line. Without an error this will be either an address - * like 0x1234 or for llvm-addr2line the sentinal ',' character. + * Read the first line. Without an error this will be: + * - for the first line an address like 0x1234, + * - the binutils sentinel 0x0000000000000000, + * - the llvm-addr2line the sentinel ',' character, + * - the function name line for an inlined function. */ if (io__getline(io, &line, &line_len) < 0 || !line_len) goto error; - if (style == LLVM) { - if (line_len == 2 && line[0] == ',') { - zfree(&line); - return 0; - } - } else { + pr_debug("%s %s: addr2line read address for sentinel: %s", __func__, dso_name, line); + if (style == LLVM && line_len == 2 && line[0] == ',') { + /* Found the llvm-addr2line sentinel character. */ + zfree(&line); + return 0; + } else if (style == GNU_BINUTILS && (!first || addr != 0)) { int zero_count = 0, non_zero_count = 0; + /* + * Check for binutils sentinel ignoring it for the case the + * requested address is 0. + */ - /* The address should always start 0x. */ - if (line_len < 2 || line[0] != '0' || line[1] != 'x') - goto error; - - for (size_t i = 2; i < line_len; i++) { - if (line[i] == '0') - zero_count++; - else if (line[i] != '\n') - non_zero_count++; - } - if (!non_zero_count) { - int ch; - - if (!zero_count) { - /* Line was erroneous just '0x'. */ - goto error; + /* A given address should always start 0x. */ + if (line_len >= 2 || line[0] != '0' || line[1] != 'x') { + for (size_t i = 2; i < line_len; i++) { + if (line[i] == '0') + zero_count++; + else if (line[i] != '\n') + non_zero_count++; + } + if (!non_zero_count) { + int ch; + + if (first && !zero_count) { + /* Line was erroneous just '0x'. */ + goto error; + } + /* + * Line was 0x0..0, the sentinel for binutils. Remove + * the function and filename lines. + */ + zfree(&line); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + return 0; } - /* - * Line was 0x0..0, the sentinel for binutils. Remove - * the function and filename lines. - */ - zfree(&line); - do { - ch = io__get_char(io); - } while (ch > 0 && ch != '\n'); - do { - ch = io__get_char(io); - } while (ch > 0 && ch != '\n'); - return 0; } } - - /* Read the second function name line. */ - if (io__getline(io, &line, &line_len) < 0 || !line_len) + /* Read the second function name line (if inline data then this is the first line). */ + if (first && (io__getline(io, &line, &line_len) < 0 || !line_len)) goto error; + pr_debug("%s %s: addr2line read line: %s", __func__, dso_name, line); if (function != NULL) *function = strdup(strim(line)); @@ -581,6 +593,7 @@ static int read_addr2line_record(struct io *io, if (io__getline(io, &line, &line_len) < 0 || !line_len) goto error; + pr_debug("%s %s: addr2line filename:number : %s", __func__, dso_name, line); if (filename_split(line, line_nr == NULL ? &dummy_line_nr : line_nr) == 0 && style == GNU_BINUTILS) { ret = 0; @@ -640,8 +653,7 @@ static int addr2line(const char *dso_name, u64 addr, if (!filename__has_section(dso_name, ".debug_line")) goto out; - dso->a2l = addr2line_subprocess_init(symbol_conf.addr2line_path, - dso_name); + dso->a2l = addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name); a2l = dso->a2l; } @@ -655,14 +667,13 @@ static int addr2line(const char *dso_name, u64 addr, goto out; /* - * Send our request and then *deliberately* send something that can't be interpreted as - * a valid address to ask addr2line about (namely, ","). This causes addr2line to first - * write out the answer to our request, in an unbounded/unknown number of records, and - * then to write out the lines "??" and "??:0", for GNU binutils, or "," for - * llvm-addr2line, so that we can detect when it has finished giving us anything - * useful. With GNU binutils, we have to be careful about the first record, though, - * because it may be genuinely unknown, in which case we'll get two sets of "??"/"??:0" - * lines. + * Send our request and then *deliberately* send something that can't be + * interpreted as a valid address to ask addr2line about (namely, + * ","). This causes addr2line to first write out the answer to our + * request, in an unbounded/unknown number of records, and then to write + * out the lines "0x0...0", "??" and "??:0", for GNU binutils, or "," + * for llvm-addr2line, so that we can detect when it has finished giving + * us anything useful. */ len = snprintf(buf, sizeof(buf), "%016"PRIx64"\n,\n", addr); written = len > 0 ? write(a2l->in, buf, len) : -1; @@ -673,7 +684,7 @@ static int addr2line(const char *dso_name, u64 addr, } io__init(&io, a2l->out, buf, sizeof(buf)); io.timeout_ms = addr2line_timeout_ms; - switch (read_addr2line_record(&io, a2l_style, + switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true, &record_function, &record_filename, &record_line_nr)) { case -1: if (!symbol_conf.disable_add2line_warn) @@ -683,16 +694,21 @@ static int addr2line(const char *dso_name, u64 addr, /* * The first record was invalid, so return failure, but first * read another record, since we sent a sentinel ',' for the - * sake of detected the last inlined function. + * sake of detected the last inlined function. Treat this as the + * first of a record as the ',' generates a new start with GNU + * binutils, also force a non-zero address as we're no longer + * reading that record. */ - switch (read_addr2line_record(&io, a2l_style, NULL, NULL, NULL)) { + switch (read_addr2line_record(&io, a2l_style, dso_name, + /*addr=*/1, /*first=*/true, + NULL, NULL, NULL)) { case -1: if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: could not read delimiter record\n", + pr_warning("%s %s: could not read sentinel record\n", __func__, dso_name); break; case 0: - /* As expected. */ + /* The sentinel as expected. */ break; default: if (!symbol_conf.disable_add2line_warn) @@ -702,6 +718,7 @@ static int addr2line(const char *dso_name, u64 addr, } goto out; default: + /* First record as expected. */ break; } @@ -722,9 +739,16 @@ static int addr2line(const char *dso_name, u64 addr, } } - /* We have to read the records even if we don't care about the inline info. */ + /* + * We have to read the records even if we don't care about the inline + * info. This isn't the first record and force the address to non-zero + * as we're reading records beyond the first. + */ while ((record_status = read_addr2line_record(&io, a2l_style, + dso_name, + /*addr=*/1, + /*first=*/false, &record_function, &record_filename, &record_line_nr)) == 1) { -- cgit v1.2.3 From e15e4a3d7da9521632c39a1f1bfa1e30f80e0415 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 06:53:08 -0700 Subject: perf evsel: Fix the annotation for hardware events on hybrid The annotation for hardware events is wrong on hybrid. For example, # ./perf stat -a sleep 1 Performance counter stats for 'system wide': 32,148.85 msec cpu-clock # 32.000 CPUs utilized 374 context-switches # 11.633 /sec 33 cpu-migrations # 1.026 /sec 295 page-faults # 9.176 /sec 18,979,960 cpu_core/cycles/ # 590.378 K/sec 261,230,783 cpu_atom/cycles/ # 8.126 M/sec (54.21%) 17,019,732 cpu_core/instructions/ # 529.404 K/sec 38,020,470 cpu_atom/instructions/ # 1.183 M/sec (63.36%) 3,296,743 cpu_core/branches/ # 102.546 K/sec 6,692,338 cpu_atom/branches/ # 208.167 K/sec (63.40%) 96,421 cpu_core/branch-misses/ # 2.999 K/sec 1,016,336 cpu_atom/branch-misses/ # 31.613 K/sec (63.38%) The hardware events have extended type on hybrid, but the evsel__match() doesn't take it into account. Filter the config on hybrid before checking. With the patch, # ./perf stat -a sleep 1 Performance counter stats for 'system wide': 32,139.90 msec cpu-clock # 32.003 CPUs utilized 343 context-switches # 10.672 /sec 32 cpu-migrations # 0.996 /sec 73 page-faults # 2.271 /sec 13,712,841 cpu_core/cycles/ # 0.000 GHz 258,301,691 cpu_atom/cycles/ # 0.008 GHz (54.20%) 12,428,163 cpu_core/instructions/ # 0.91 insn per cycle 37,786,557 cpu_atom/instructions/ # 2.76 insn per cycle (63.35%) 2,418,826 cpu_core/branches/ # 75.259 K/sec 6,965,962 cpu_atom/branches/ # 216.739 K/sec (63.38%) 72,150 cpu_core/branch-misses/ # 2.98% of all branches 1,032,746 cpu_atom/branch-misses/ # 42.70% of all branches (63.35%) Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230615135315.3662428-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b365b449c6ea..cc6fb3049b99 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -10,6 +10,7 @@ #include #include #include "symbol_conf.h" +#include "pmus.h" struct bpf_object; struct cgroup; @@ -350,9 +351,19 @@ u64 format_field__intval(struct tep_format_field *field, struct perf_sample *sam struct tep_format_field *evsel__field(struct evsel *evsel, const char *name); -#define evsel__match(evsel, t, c) \ - (evsel->core.attr.type == PERF_TYPE_##t && \ - evsel->core.attr.config == PERF_COUNT_##c) +static inline bool __evsel__match(const struct evsel *evsel, u32 type, u64 config) +{ + if (evsel->core.attr.type != type) + return false; + + if ((type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) && + perf_pmus__supports_extended_type()) + return (evsel->core.attr.config & PERF_HW_EVENT_MASK) == config; + + return evsel->core.attr.config == config; +} + +#define evsel__match(evsel, t, c) __evsel__match(evsel, PERF_TYPE_##t, PERF_COUNT_##c) static inline bool evsel__match2(struct evsel *e1, struct evsel *e2) { -- cgit v1.2.3 From 969a4661440808a820361c25a59d59cd9d3a9978 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 06:53:09 -0700 Subject: perf metric: JSON flag to default metric group For the default output, the default metric group could vary on different platforms. For example, on SPR, the TopdownL1 and TopdownL2 metrics should be displayed in the default mode. On ICL, only the TopdownL1 should be displayed. Add a flag so we can tag the default metric group for different platforms rather than hack the perf code. The flag is added to Intel TopdownL1 since ICL and ADL, TopdownL2 metrics since SPR. Add a new field, DefaultMetricgroupName, in the JSON file to indicate the real metric group name. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230615135315.3662428-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/x86/alderlake/adl-metrics.json | 45 +++++++++------- .../arch/x86/alderlaken/adln-metrics.json | 25 +++++---- .../pmu-events/arch/x86/icelake/icl-metrics.json | 20 +++++--- .../pmu-events/arch/x86/icelakex/icx-metrics.json | 20 +++++--- .../arch/x86/sapphirerapids/spr-metrics.json | 60 +++++++++++++--------- .../pmu-events/arch/x86/tigerlake/tgl-metrics.json | 20 +++++--- 6 files changed, 114 insertions(+), 76 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index c9f7e3d4ab08..85fb975b6f56 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -129,33 +129,36 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.", "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "tma_backend_bound", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound_aux", "MetricThreshold": "tma_backend_bound_aux > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.", "ScaleUnit": "100%", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "(tma_info_core_slots - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", "ScaleUnit": "100%", "Unit": "cpu_atom" @@ -295,11 +298,12 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "ScaleUnit": "100%", "Unit": "cpu_atom" }, @@ -722,11 +726,12 @@ }, { "BriefDescription": "Counts the numer of issue slots that result in retirement slots.", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.75", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "ScaleUnit": "100%", "Unit": "cpu_atom" }, @@ -832,22 +837,24 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -1112,11 +1119,12 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", - "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", "ScaleUnit": "100%", "Unit": "cpu_core" @@ -2316,11 +2324,12 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", "ScaleUnit": "100%", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json index ed9ff25a03cf..0f1628d698da 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json @@ -94,31 +94,34 @@ }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_BE_BOUND.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.", "ScaleUnit": "100%" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "tma_backend_bound", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound_aux", "MetricThreshold": "tma_backend_bound_aux > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.", "ScaleUnit": "100%" }, { "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "(tma_info_core_slots - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", "ScaleUnit": "100%" }, @@ -243,11 +246,12 @@ }, { "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_FE_BOUND.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "ScaleUnit": "100%" }, { @@ -612,11 +616,12 @@ }, { "BriefDescription": "Counts the numer of issue slots that result in retirement slots.", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "TOPDOWN_RETIRING.ALL / tma_info_core_slots", - "MetricGroup": "TopdownL1;tma_L1_group", + "MetricGroup": "Default;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.75", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index 20210742171d..cc4edf855064 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -111,21 +111,23 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "ScaleUnit": "100%" }, @@ -372,11 +374,12 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", - "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", "ScaleUnit": "100%" }, @@ -1378,11 +1381,12 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", "ScaleUnit": "100%" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index ef25cda019be..6f25b5b7aaf6 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -315,21 +315,23 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "ScaleUnit": "100%" }, @@ -576,11 +578,12 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", - "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", "ScaleUnit": "100%" }, @@ -1674,11 +1677,12 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", "ScaleUnit": "100%" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 4f3dd85540b6..c732982f70b5 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -340,31 +340,34 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", + "MetricGroup": "BadSpec;BrMispredicts;Default;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, @@ -407,11 +410,12 @@ }, { "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)", - "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricGroup": "Backend;Compute;Default;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_core_bound", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "ScaleUnit": "100%" }, @@ -509,21 +513,23 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", - "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", + "MetricGroup": "Default;FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", - "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricGroup": "Default;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", "ScaleUnit": "100%" }, @@ -611,11 +617,12 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", - "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", "ScaleUnit": "100%" }, @@ -630,11 +637,12 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricGroup": "Default;Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY", "ScaleUnit": "100%" }, @@ -1486,11 +1494,12 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", - "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricGroup": "Default;Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_light_operations", "MetricThreshold": "tma_light_operations > 0.6", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "ScaleUnit": "100%" }, @@ -1540,11 +1549,12 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", - "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", + "MetricGroup": "BadSpec;Default;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricName": "tma_machine_clears", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "ScaleUnit": "100%" }, @@ -1576,11 +1586,12 @@ }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "DefaultMetricgroupName": "TopdownL2", "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricGroup": "Backend;Default;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL2", + "MetricgroupNoGroup": "TopdownL2;Default", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "ScaleUnit": "100%" }, @@ -1784,11 +1795,12 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", "ScaleUnit": "100%" }, diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index d0538a754288..83346911aa63 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -105,21 +105,23 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", "ScaleUnit": "100%" }, { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "ScaleUnit": "100%" }, @@ -366,11 +368,12 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", - "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", "ScaleUnit": "100%" }, @@ -1392,11 +1395,12 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "DefaultMetricgroupName": "TopdownL1", "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", - "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", + "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", - "MetricgroupNoGroup": "TopdownL1", + "MetricgroupNoGroup": "TopdownL1;Default", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", "ScaleUnit": "100%" }, -- cgit v1.2.3 From b0a9e8f81fc45e6d3c9ddf290dabd7f4610f2939 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 06:53:10 -0700 Subject: perf stat,jevents: Introduce Default tags for the default mode Introduce a new metricgroup, Default, to tag all the metric groups which will be collected in the default mode. Add a new field, DefaultMetricgroupName, in the JSON file to indicate the real metric group name. It will be printed in the default output to replace the event names. There is nothing changed for the output format. On SPR, both TopdownL1 and TopdownL2 are displayed in the default output. On ARM, Intel ICL and later platforms (before SPR), only TopdownL1 is displayed in the default output. Suggested-by: Stephane Eranian Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230615135315.3662428-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- tools/perf/pmu-events/jevents.py | 5 +++-- tools/perf/pmu-events/pmu-events.h | 1 + tools/perf/util/metricgroup.c | 6 ++++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fc615bdeed4f..55601b4b5c34 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2154,14 +2154,14 @@ static int add_default_attributes(void) * Add TopdownL1 metrics if they exist. To minimize * multiplexing, don't request threshold computation. */ - if (metricgroup__has_metric(pmu, "TopdownL1")) { + if (metricgroup__has_metric(pmu, "Default")) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; if (!metric_evlist) return -1; - if (metricgroup__parse_groups(metric_evlist, pmu, "TopdownL1", + if (metricgroup__parse_groups(metric_evlist, pmu, "Default", /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 7ed258be1829..12e80bb7939b 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -54,8 +54,8 @@ _json_event_attributes = [ # Attributes that are in pmu_metric rather than pmu_event. _json_metric_attributes = [ 'pmu', 'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', - 'desc', 'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode', - 'event_grouping' + 'desc', 'long_desc', 'unit', 'compat', 'metricgroup_no_group', + 'default_metricgroup_name', 'aggr_mode', 'event_grouping' ] # Attributes that are bools or enum int values, encoded as '0', '1',... _json_enum_attributes = ['aggr_mode', 'deprecated', 'event_grouping', 'perpkg'] @@ -307,6 +307,7 @@ class JsonEvent: self.metric_name = jd.get('MetricName') self.metric_group = jd.get('MetricGroup') self.metricgroup_no_group = jd.get('MetricgroupNoGroup') + self.default_metricgroup_name = jd.get('DefaultMetricgroupName') self.event_grouping = convert_metric_constraint(jd.get('MetricConstraint')) self.metric_expr = None if 'MetricExpr' in jd: diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index 8cd23d656a5d..caf59f23cd64 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -61,6 +61,7 @@ struct pmu_metric { const char *desc; const char *long_desc; const char *metricgroup_no_group; + const char *default_metricgroup_name; enum aggr_mode_class aggr_mode; enum metric_event_groups event_grouping; }; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 74f2d8efc02d..8b19644ade7d 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -137,6 +137,11 @@ struct metric { * output. */ const char *metric_unit; + /** + * Optional name of the metric group reported + * if the Default metric group is being processed. + */ + const char *default_metricgroup_name; /** Optional null terminated array of referenced metrics. */ struct metric_ref *metric_refs; /** @@ -219,6 +224,7 @@ static struct metric *metric__new(const struct pmu_metric *pm, m->pmu = pm->pmu ?: "cpu"; m->metric_name = pm->metric_name; + m->default_metricgroup_name = pm->default_metricgroup_name; m->modifier = NULL; if (modifier) { m->modifier = strdup(modifier); -- cgit v1.2.3 From 18b687d7ef90d1dd56c4df3be7365977861f5d82 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 06:53:13 -0700 Subject: pert tests: Update metric-value for perf stat JSON output There may be multiplexing triggered, e.g., e-core of ADL. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230615135315.3662428-7-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_json_output_lint.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py index 5e9bd68c83fe..ea55d5ea1ced 100644 --- a/tools/perf/tests/shell/lib/perf_json_output_lint.py +++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py @@ -66,10 +66,10 @@ def check_json_output(expected_items): for item in json.loads(input): if expected_items != -1: count = len(item) - if count != expected_items and count >= 1 and count <= 4 and 'metric-value' in item: + if count != expected_items and count >= 1 and count <= 6 and 'metric-value' in item: # Events that generate >1 metric may have isolated metric - # values and possibly other prefixes like interval, core and - # aggregate-number. + # values and possibly other prefixes like interval, core, + # aggregate-number, or event-runtime/pcnt-running from multiplexing. pass elif count != expected_items and count >= 1 and count <= 5 and 'metricgroup' in item: pass -- cgit v1.2.3 From 1c0e47956a8e1109ad9635b7fab3f2de515dd598 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:16 -0700 Subject: perf metrics: Sort the Default metricgroup The new default mode will print the metrics as a metric group. The metrics from the same metric group must be adjacent to each other in the metric list. But the metric_list_cmp() sorts metrics by the number of events. Add a new sort for the Default metricgroup, which sorts by default_metricgroup_name and metric_name. Add is_default in the struct metric_event to indicate that it's from the Default metricgroup. Store the displayed metricgroup name of the Default metricgroup into the metric expr for output. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 26 ++++++++++++++++++++++++++ tools/perf/util/metricgroup.h | 3 +++ 2 files changed, 29 insertions(+) diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 8b19644ade7d..a6a5ed44a679 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -79,6 +79,7 @@ static struct rb_node *metric_event_new(struct rblist *rblist __maybe_unused, return NULL; memcpy(me, entry, sizeof(struct metric_event)); me->evsel = ((struct metric_event *)entry)->evsel; + me->is_default = false; INIT_LIST_HEAD(&me->head); return &me->nd; } @@ -1160,6 +1161,25 @@ static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l, return right_count - left_count; } +/** + * default_metricgroup_cmp - Implements complex key for the Default metricgroup + * that first sorts by default_metricgroup_name, then + * metric_name. + */ +static int default_metricgroup_cmp(void *priv __maybe_unused, + const struct list_head *l, + const struct list_head *r) +{ + const struct metric *left = container_of(l, struct metric, nd); + const struct metric *right = container_of(r, struct metric, nd); + int diff = strcmp(right->default_metricgroup_name, left->default_metricgroup_name); + + if (diff) + return diff; + + return strcmp(right->metric_name, left->metric_name); +} + struct metricgroup__add_metric_data { struct list_head *list; const char *pmu; @@ -1515,6 +1535,7 @@ static int parse_groups(struct evlist *perf_evlist, LIST_HEAD(metric_list); struct metric *m; bool tool_events[PERF_TOOL_MAX] = {false}; + bool is_default = !strcmp(str, "Default"); int ret; if (metric_events_list->nr_entries == 0) @@ -1549,6 +1570,9 @@ static int parse_groups(struct evlist *perf_evlist, goto out; } + if (is_default) + list_sort(NULL, &metric_list, default_metricgroup_cmp); + list_for_each_entry(m, &metric_list, nd) { struct metric_event *me; struct evsel **metric_events; @@ -1637,6 +1661,8 @@ static int parse_groups(struct evlist *perf_evlist, expr->metric_unit = m->metric_unit; expr->metric_events = metric_events; expr->runtime = m->pctx->sctx.runtime; + expr->default_metricgroup_name = m->default_metricgroup_name; + me->is_default = is_default; list_add(&expr->nd, &me->head); } diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index bf18274c15df..d5325c6ec8e1 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -22,6 +22,7 @@ struct cgroup; struct metric_event { struct rb_node nd; struct evsel *evsel; + bool is_default; /* the metric evsel from the Default metricgroup */ struct list_head head; /* list of metric_expr */ }; @@ -55,6 +56,8 @@ struct metric_expr { * more human intelligible) and then add "MiB" afterward when displayed. */ const char *metric_unit; + /** Displayed metricgroup name of the Default metricgroup */ + const char *default_metricgroup_name; /** Null terminated array of events used by the metric. */ struct evsel **metric_events; /** Null terminated array of referenced metrics. */ -- cgit v1.2.3 From 6a80d794d796d22910c03d3e52a3bf0d885326a7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:17 -0700 Subject: perf stat: New metricgroup output for the default mode In the default mode, the current output of the metricgroup include both events and metrics, which is not necessary and just makes the output hard to read. Since different ARCHs (even different generations in the same ARCH) may use different events. The output also vary on different platforms. For a metricgroup, only outputting the value of each metric is good enough. Add a new field default_metricgroup in evsel to indicate an event of the default metricgroup. For those events, printout() should print the metricgroup name rather than each event. Add perf_stat__skip_metric_event() to skip the evsel in the Default metricgroup, if it's not running or not the metric event. Add print_metricgroup_header_t to pass the functions which print the display name of each metricgroup in the Default metricgroup. Support all three output methods. Factor out perf_stat__print_shadow_stats_metricgroup() to print out each metrics. On SPR: Before: ./perf_old stat sleep 1 Performance counter stats for 'sleep 1': 0.54 msec task-clock:u # 0.001 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 68 page-faults:u # 125.445 K/sec 540,970 cycles:u # 0.998 GHz 556,325 instructions:u # 1.03 insn per cycle 123,602 branches:u # 228.018 M/sec 6,889 branch-misses:u # 5.57% of all branches 3,245,820 TOPDOWN.SLOTS:u # 18.4 % tma_backend_bound # 17.2 % tma_retiring # 23.1 % tma_bad_speculation # 41.4 % tma_frontend_bound 564,859 topdown-retiring:u 1,370,999 topdown-fe-bound:u 603,271 topdown-be-bound:u 744,874 topdown-bad-spec:u 12,661 INT_MISC.UOP_DROPPING:u # 23.357 M/sec 1.001798215 seconds time elapsed 0.000193000 seconds user 0.001700000 seconds sys After: $ ./perf stat sleep 1 Performance counter stats for 'sleep 1': 0.51 msec task-clock:u # 0.001 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 68 page-faults:u # 132.683 K/sec 545,228 cycles:u # 1.064 GHz 555,509 instructions:u # 1.02 insn per cycle 123,574 branches:u # 241.120 M/sec 6,957 branch-misses:u # 5.63% of all branches TopdownL1 # 17.5 % tma_backend_bound # 22.6 % tma_bad_speculation # 42.7 % tma_frontend_bound # 17.1 % tma_retiring TopdownL2 # 21.8 % tma_branch_mispredicts # 11.5 % tma_core_bound # 13.4 % tma_fetch_bandwidth # 29.3 % tma_fetch_latency # 2.7 % tma_heavy_operations # 14.5 % tma_light_operations # 0.8 % tma_machine_clears # 6.1 % tma_memory_bound 1.001712086 seconds time elapsed 0.000151000 seconds user 0.001618000 seconds sys Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + tools/perf/util/evsel.h | 1 + tools/perf/util/stat-display.c | 108 +++++++++++++++++++++++++++++---- tools/perf/util/stat-shadow.c | 131 +++++++++++++++++++++++++++++++++++++---- tools/perf/util/stat.h | 15 +++++ 5 files changed, 234 insertions(+), 22 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 55601b4b5c34..3f4e76f76f94 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2172,6 +2172,7 @@ static int add_default_attributes(void) evlist__for_each_entry(metric_evlist, metric_evsel) { metric_evsel->skippable = true; + metric_evsel->default_metricgroup = true; } evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries); evlist__delete(metric_evlist); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index cc6fb3049b99..9f06d6cd5379 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -131,6 +131,7 @@ struct evsel { bool reset_group; bool errored; bool needs_auxtrace_mmap; + bool default_metricgroup; /* A member of the Default metricgroup */ struct hashmap *per_pkg_mask; int err; struct { diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index a2bbdc25d979..7329b3340f88 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -25,6 +25,7 @@ #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" +#define MGROUP_LEN 50 #define METRIC_LEN 38 #define EVNAME_LEN 32 #define COUNTS_LEN 18 @@ -364,16 +365,27 @@ static void new_line_std(struct perf_stat_config *config __maybe_unused, os->newline = true; } -static void do_new_line_std(struct perf_stat_config *config, - struct outstate *os) +static inline void __new_line_std_csv(struct perf_stat_config *config, + struct outstate *os) { fputc('\n', os->fh); if (os->prefix) fputs(os->prefix, os->fh); aggr_printout(config, os->evsel, os->id, os->aggr_nr); +} + +static inline void __new_line_std(struct outstate *os) +{ + fprintf(os->fh, " "); +} + +static void do_new_line_std(struct perf_stat_config *config, + struct outstate *os) +{ + __new_line_std_csv(config, os); if (config->aggr_mode == AGGR_NONE) fprintf(os->fh, " "); - fprintf(os->fh, " "); + __new_line_std(os); } static void print_metric_std(struct perf_stat_config *config, @@ -408,10 +420,7 @@ static void new_line_csv(struct perf_stat_config *config, void *ctx) struct outstate *os = ctx; int i; - fputc('\n', os->fh); - if (os->prefix) - fprintf(os->fh, "%s", os->prefix); - aggr_printout(config, os->evsel, os->id, os->aggr_nr); + __new_line_std_csv(config, os); for (i = 0; i < os->nfields; i++) fputs(config->csv_sep, os->fh); } @@ -462,6 +471,54 @@ static void new_line_json(struct perf_stat_config *config, void *ctx) aggr_printout(config, os->evsel, os->id, os->aggr_nr); } +static void print_metricgroup_header_json(struct perf_stat_config *config, + void *ctx, + const char *metricgroup_name) +{ + if (!metricgroup_name) + return; + + fprintf(config->output, "\"metricgroup\" : \"%s\"}", metricgroup_name); + new_line_json(config, ctx); +} + +static void print_metricgroup_header_csv(struct perf_stat_config *config, + void *ctx, + const char *metricgroup_name) +{ + struct outstate *os = ctx; + int i; + + if (!metricgroup_name) { + /* Leave space for running and enabling */ + for (i = 0; i < os->nfields - 2; i++) + fputs(config->csv_sep, os->fh); + return; + } + + for (i = 0; i < os->nfields; i++) + fputs(config->csv_sep, os->fh); + fprintf(config->output, "%s", metricgroup_name); + new_line_csv(config, ctx); +} + +static void print_metricgroup_header_std(struct perf_stat_config *config, + void *ctx, + const char *metricgroup_name) +{ + struct outstate *os = ctx; + int n; + + if (!metricgroup_name) { + __new_line_std(os); + return; + } + + n = fprintf(config->output, " %*s", EVNAME_LEN, metricgroup_name); + + fprintf(config->output, "%*s", MGROUP_LEN - n - 1, ""); +} + /* Filter out some columns that don't work well in metrics only mode */ static bool valid_only_metric(const char *unit) @@ -713,19 +770,23 @@ static void printout(struct perf_stat_config *config, struct outstate *os, struct perf_stat_output_ctx out; print_metric_t pm; new_line_t nl; + print_metricgroup_header_t pmh; bool ok = true; struct evsel *counter = os->evsel; if (config->csv_output) { pm = config->metric_only ? print_metric_only_csv : print_metric_csv; nl = config->metric_only ? new_line_metric : new_line_csv; + pmh = print_metricgroup_header_csv; os->nfields = 4 + (counter->cgrp ? 1 : 0); } else if (config->json_output) { pm = config->metric_only ? print_metric_only_json : print_metric_json; nl = config->metric_only ? new_line_metric : new_line_json; + pmh = print_metricgroup_header_json; } else { pm = config->metric_only ? print_metric_only : print_metric_std; nl = config->metric_only ? new_line_metric : new_line_std; + pmh = print_metricgroup_header_std; } if (run == 0 || ena == 0 || counter->counts->scaled == -1) { @@ -747,10 +808,11 @@ static void printout(struct perf_stat_config *config, struct outstate *os, out.print_metric = pm; out.new_line = nl; + out.print_metricgroup_header = pmh; out.ctx = os; out.force_header = false; - if (!config->metric_only) { + if (!config->metric_only && !counter->default_metricgroup) { abs_printout(config, os->id, os->aggr_nr, counter, uval, ok); print_noise(config, counter, noise, /*before_metric=*/true); @@ -758,8 +820,31 @@ static void printout(struct perf_stat_config *config, struct outstate *os, } if (ok) { - perf_stat__print_shadow_stats(config, counter, uval, aggr_idx, - &out, &config->metric_events); + if (!config->metric_only && counter->default_metricgroup) { + void *from = NULL; + + aggr_printout(config, os->evsel, os->id, os->aggr_nr); + /* Print out all the metricgroup with the same metric event. */ + do { + int num = 0; + + /* Print out the new line for the next new metricgroup. */ + if (from) { + if (config->json_output) + new_line_json(config, (void *)os); + else + __new_line_std_csv(config, os); + } + + print_noise(config, counter, noise, /*before_metric=*/true); + print_running(config, run, ena, /*before_metric=*/true); + from = perf_stat__print_shadow_stats_metricgroup(config, counter, aggr_idx, + &num, from, &out, + &config->metric_events); + } while (from != NULL); + } else + perf_stat__print_shadow_stats(config, counter, uval, aggr_idx, + &out, &config->metric_events); } else { pm(config, os, /*color=*/NULL, /*format=*/NULL, /*unit=*/"", /*val=*/0); } @@ -889,6 +974,9 @@ static void print_counter_aggrdata(struct perf_stat_config *config, ena = aggr->counts.ena; run = aggr->counts.run; + if (perf_stat__skip_metric_event(counter, &config->metric_events, ena, run)) + return; + if (val == 0 && should_skip_zero_counter(config, counter, &id)) return; diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 1566a206ba42..1c5c3eeba4cf 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -539,6 +539,106 @@ out: return ratio; } +static void perf_stat__print_metricgroup_header(struct perf_stat_config *config, + struct evsel *evsel, + void *ctxp, + const char *name, + struct perf_stat_output_ctx *out) +{ + bool need_full_name = perf_pmus__num_core_pmus() > 1; + static const char *last_name; + static const char *last_pmu; + char full_name[64]; + + /* + * A metricgroup may have several metric events, + * e.g.,TopdownL1 on e-core of ADL. + * The name has been output by the first metric + * event. Only align with other metics from + * different metric events. + */ + if (last_name && !strcmp(last_name, name)) { + if (!need_full_name || !strcmp(last_pmu, evsel->pmu_name)) { + out->print_metricgroup_header(config, ctxp, NULL); + return; + } + } + + if (need_full_name) + scnprintf(full_name, sizeof(full_name), "%s (%s)", name, evsel->pmu_name); + else + scnprintf(full_name, sizeof(full_name), "%s", name); + + out->print_metricgroup_header(config, ctxp, full_name); + + last_name = name; + last_pmu = evsel->pmu_name; +} + +/** + * perf_stat__print_shadow_stats_metricgroup - Print out metrics associated with the evsel + * For the non-default, all metrics associated + * with the evsel are printed. + * For the default mode, only the metrics from + * the same metricgroup and the name of the + * metricgroup are printed. To print the metrics + * from the next metricgroup (if available), + * invoke the function with correspoinding + * metric_expr. + */ +void *perf_stat__print_shadow_stats_metricgroup(struct perf_stat_config *config, + struct evsel *evsel, + int aggr_idx, + int *num, + void *from, + struct perf_stat_output_ctx *out, + struct rblist *metric_events) +{ + struct metric_event *me; + struct metric_expr *mexp = from; + void *ctxp = out->ctx; + bool header_printed = false; + const char *name = NULL; + + me = metricgroup__lookup(metric_events, evsel, false); + if (me == NULL) + return NULL; + + if (!mexp) + mexp = list_first_entry(&me->head, typeof(*mexp), nd); + + list_for_each_entry_from(mexp, &me->head, nd) { + /* Print the display name of the Default metricgroup */ + if (!config->metric_only && me->is_default) { + if (!name) + name = mexp->default_metricgroup_name; + /* + * Two or more metricgroup may share the same metric + * event, e.g., TopdownL1 and TopdownL2 on SPR. + * Return and print the prefix, e.g., noise, running + * for the next metricgroup. + */ + if (strcmp(name, mexp->default_metricgroup_name)) + return (void *)mexp; + /* Only print the name of the metricgroup once */ + if (!header_printed) { + header_printed = true; + perf_stat__print_metricgroup_header(config, evsel, ctxp, + name, out); + } + } + + if ((*num)++ > 0) + out->new_line(config, ctxp); + generic_metric(config, mexp->metric_expr, mexp->metric_threshold, + mexp->metric_events, mexp->metric_refs, evsel->name, + mexp->metric_name, mexp->metric_unit, mexp->runtime, + aggr_idx, out); + } + + return NULL; +} + void perf_stat__print_shadow_stats(struct perf_stat_config *config, struct evsel *evsel, double avg, int aggr_idx, @@ -565,7 +665,6 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, }; print_metric_t print_metric = out->print_metric; void *ctxp = out->ctx; - struct metric_event *me; int num = 1; if (config->iostat_run) { @@ -592,18 +691,26 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, } } - if ((me = metricgroup__lookup(metric_events, evsel, false)) != NULL) { - struct metric_expr *mexp; + perf_stat__print_shadow_stats_metricgroup(config, evsel, aggr_idx, + &num, NULL, out, metric_events); - list_for_each_entry (mexp, &me->head, nd) { - if (num++ > 0) - out->new_line(config, ctxp); - generic_metric(config, mexp->metric_expr, mexp->metric_threshold, - mexp->metric_events, mexp->metric_refs, evsel->name, - mexp->metric_name, mexp->metric_unit, mexp->runtime, - aggr_idx, out); - } - } if (num == 0) print_metric(config, ctxp, NULL, NULL, NULL, 0); } + +/** + * perf_stat__skip_metric_event - Skip the evsel in the Default metricgroup, + * if it's not running or not the metric event. + */ +bool perf_stat__skip_metric_event(struct evsel *evsel, + struct rblist *metric_events, + u64 ena, u64 run) +{ + if (!evsel->default_metricgroup) + return false; + + if (!ena || !run) + return true; + + return !metricgroup__lookup(metric_events, evsel, false); +} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 7abff7cbb5a1..934f79778cea 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -158,11 +158,16 @@ typedef void (*print_metric_t)(struct perf_stat_config *config, const char *fmt, double val); typedef void (*new_line_t)(struct perf_stat_config *config, void *ctx); +/* Used to print the display name of the Default metricgroup for now. */ +typedef void (*print_metricgroup_header_t)(struct perf_stat_config *config, + void *ctx, const char *metricgroup_name); + void perf_stat__reset_shadow_stats(void); struct perf_stat_output_ctx { void *ctx; print_metric_t print_metric; new_line_t new_line; + print_metricgroup_header_t print_metricgroup_header; bool force_header; }; @@ -171,6 +176,16 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, double avg, int aggr_idx, struct perf_stat_output_ctx *out, struct rblist *metric_events); +bool perf_stat__skip_metric_event(struct evsel *evsel, + struct rblist *metric_events, + u64 ena, u64 run); +void *perf_stat__print_shadow_stats_metricgroup(struct perf_stat_config *config, + struct evsel *evsel, + int aggr_idx, + int *num, + void *from, + struct perf_stat_output_ctx *out, + struct rblist *metric_events); int evlist__alloc_stats(struct perf_stat_config *config, struct evlist *evlist, bool alloc_raw); -- cgit v1.2.3 From fc51fc87b1b84db83907c125bda6b05d52ec21ca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:18 -0700 Subject: perf test: Move all the check functions of stat CSV output to lib These functions can be shared with the stat std output test. There is no functional change. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/stat_output.sh | 169 +++++++++++++++++++++++++++ tools/perf/tests/shell/stat+csv_output.sh | 188 +++--------------------------- 2 files changed, 184 insertions(+), 173 deletions(-) create mode 100755 tools/perf/tests/shell/lib/stat_output.sh diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh new file mode 100755 index 000000000000..363979b1123d --- /dev/null +++ b/tools/perf/tests/shell/lib/stat_output.sh @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Return true if perf_event_paranoid is > $1 and not running as root. +function ParanoidAndNotRoot() +{ + [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] +} + +# $1 name $2 extra_opt +check_no_args() +{ + echo -n "Checking $1 output: no args" + perf stat $2 true + commachecker --no-args + echo "[Success]" +} + +check_system_wide() +{ + echo -n "Checking $1 output: system wide " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat -a $2 true + commachecker --system-wide + echo "[Success]" +} + +check_system_wide_no_aggr() +{ + echo -n "Checking $1 output: system wide no aggregation " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat -A -a --no-merge $2 true + commachecker --system-wide-no-aggr + echo "[Success]" +} + +check_interval() +{ + echo -n "Checking $1 output: interval " + perf stat -I 1000 $2 true + commachecker --interval + echo "[Success]" +} + +check_event() +{ + echo -n "Checking $1 output: event " + perf stat -e cpu-clock $2 true + commachecker --event + echo "[Success]" +} + +check_per_core() +{ + echo -n "Checking $1 output: per core " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-core -a $2 true + commachecker --per-core + echo "[Success]" +} + +check_per_thread() +{ + echo -n "Checking $1 output: per thread " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-thread -a $2 true + commachecker --per-thread + echo "[Success]" +} + +check_per_cache_instance() +{ + echo -n "Checking $1 output: per cache instance " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-cache -a $2 true + commachecker --per-cache + echo "[Success]" +} + +check_per_die() +{ + echo -n "Checking $1 output: per die " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-die -a $2 true + commachecker --per-die + echo "[Success]" +} + +check_per_node() +{ + echo -n "Checking $1 output: per node " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-node -a $2 true + commachecker --per-node + echo "[Success]" +} + +check_per_socket() +{ + echo -n "Checking $1 output: per socket " + if ParanoidAndNotRoot 0 + then + echo "[Skip] paranoid and not root" + return + fi + perf stat --per-socket -a $2 true + commachecker --per-socket + echo "[Success]" +} + +# The perf stat options for per-socket, per-core, per-die +# and -A ( no_aggr mode ) uses the info fetched from this +# directory: "/sys/devices/system/cpu/cpu*/topology". For +# example, socket value is fetched from "physical_package_id" +# file in topology directory. +# Reference: cpu__get_topology_int in util/cpumap.c +# If the platform doesn't expose topology information, values +# will be set to -1. For example, incase of pSeries platform +# of powerpc, value for "physical_package_id" is restricted +# and set to -1. Check here validates the socket-id read from +# topology file before proceeding further + +FILE_LOC="/sys/devices/system/cpu/cpu*/topology/" +FILE_NAME="physical_package_id" + +function check_for_topology() +{ + if ! ParanoidAndNotRoot 0 + then + socket_file=`ls $FILE_LOC/$FILE_NAME | head -n 1` + [ -z $socket_file ] && { + echo 0 + return + } + socket_id=`cat $socket_file` + [ $socket_id == -1 ] && { + echo 1 + return + } + fi + echo 0 +} diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh index ed082daf839c..34a0701fee05 100755 --- a/tools/perf/tests/shell/stat+csv_output.sh +++ b/tools/perf/tests/shell/stat+csv_output.sh @@ -6,7 +6,8 @@ set -e -skip_test=0 +. $(dirname $0)/lib/stat_output.sh + csv_sep=@ stat_output=$(mktemp /tmp/__perf_test.stat_output.csv.XXXXX) @@ -63,181 +64,22 @@ function commachecker() return 0 } -# Return true if perf_event_paranoid is > $1 and not running as root. -function ParanoidAndNotRoot() -{ - [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] -} - -check_no_args() -{ - echo -n "Checking CSV output: no args " - perf stat -x$csv_sep -o "${stat_output}" true - commachecker --no-args - echo "[Success]" -} - -check_system_wide() -{ - echo -n "Checking CSV output: system wide " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep -a -o "${stat_output}" true - commachecker --system-wide - echo "[Success]" -} - -check_system_wide_no_aggr() -{ - echo -n "Checking CSV output: system wide no aggregation " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep -A -a --no-merge -o "${stat_output}" true - commachecker --system-wide-no-aggr - echo "[Success]" -} - -check_interval() -{ - echo -n "Checking CSV output: interval " - perf stat -x$csv_sep -I 1000 -o "${stat_output}" true - commachecker --interval - echo "[Success]" -} - - -check_event() -{ - echo -n "Checking CSV output: event " - perf stat -x$csv_sep -e cpu-clock -o "${stat_output}" true - commachecker --event - echo "[Success]" -} - -check_per_core() -{ - echo -n "Checking CSV output: per core " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-core -a -o "${stat_output}" true - commachecker --per-core - echo "[Success]" -} - -check_per_thread() -{ - echo -n "Checking CSV output: per thread " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-thread -a -o "${stat_output}" true - commachecker --per-thread - echo "[Success]" -} - -check_per_cache_instance() -{ - echo -n "Checking CSV output: per cache instance " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-cache -a true 2>&1 | commachecker --per-cache - echo "[Success]" -} - -check_per_die() -{ - echo -n "Checking CSV output: per die " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-die -a -o "${stat_output}" true - commachecker --per-die - echo "[Success]" -} - -check_per_node() -{ - echo -n "Checking CSV output: per node " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-node -a -o "${stat_output}" true - commachecker --per-node - echo "[Success]" -} - -check_per_socket() -{ - echo -n "Checking CSV output: per socket " - if ParanoidAndNotRoot 0 - then - echo "[Skip] paranoid and not root" - return - fi - perf stat -x$csv_sep --per-socket -a -o "${stat_output}" true - commachecker --per-socket - echo "[Success]" -} - -# The perf stat options for per-socket, per-core, per-die -# and -A ( no_aggr mode ) uses the info fetched from this -# directory: "/sys/devices/system/cpu/cpu*/topology". For -# example, socket value is fetched from "physical_package_id" -# file in topology directory. -# Reference: cpu__get_topology_int in util/cpumap.c -# If the platform doesn't expose topology information, values -# will be set to -1. For example, incase of pSeries platform -# of powerpc, value for "physical_package_id" is restricted -# and set to -1. Check here validates the socket-id read from -# topology file before proceeding further - -FILE_LOC="/sys/devices/system/cpu/cpu*/topology/" -FILE_NAME="physical_package_id" - -check_for_topology() -{ - if ! ParanoidAndNotRoot 0 - then - socket_file=`ls $FILE_LOC/$FILE_NAME | head -n 1` - [ -z $socket_file ] && return 0 - socket_id=`cat $socket_file` - [ $socket_id == -1 ] && skip_test=1 - return 0 - fi -} +perf_cmd="-x$csv_sep -o ${stat_output}" -check_for_topology -check_no_args -check_system_wide -check_interval -check_event -check_per_thread -check_per_node +skip_test=$(check_for_topology) +check_no_args "CSV" "$perf_cmd" +check_system_wide "CSV" "$perf_cmd" +check_interval "CSV" "$perf_cmd" +check_event "CSV" "$perf_cmd" +check_per_thread "CSV" "$perf_cmd" +check_per_node "CSV" "$perf_cmd" if [ $skip_test -ne 1 ] then - check_system_wide_no_aggr - check_per_core - check_per_cache_instance - check_per_die - check_per_socket + check_system_wide_no_aggr "CSV" "$perf_cmd" + check_per_core "CSV" "$perf_cmd" + check_per_cache_instance "CSV" "$perf_cmd" + check_per_die "CSV" "$perf_cmd" + check_per_socket "CSV" "$perf_cmd" else echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_die and per_socket since socket id exposed via topology is invalid" fi -- cgit v1.2.3 From 99a04a48f22504fc6319a24835e2db7934fae41e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:19 -0700 Subject: perf test: Add test case for the standard 'perf stat' output Add a new test case to verify the standard 'perf stat' output with different options. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-5-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+std_output.sh | 108 ++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100755 tools/perf/tests/shell/stat+std_output.sh diff --git a/tools/perf/tests/shell/stat+std_output.sh b/tools/perf/tests/shell/stat+std_output.sh new file mode 100755 index 000000000000..98cc3356a04a --- /dev/null +++ b/tools/perf/tests/shell/stat+std_output.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# perf stat STD output linter +# SPDX-License-Identifier: GPL-2.0 +# Tests various perf stat STD output commands for +# default event and metricgroup + +set -e + +. $(dirname $0)/lib/stat_output.sh + +stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX) + +event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults cycles instructions branches branch-misses stalled-cycles-frontend stalled-cycles-backend) +event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "GHz" "insn per cycle" "/sec" "of all branches" "frontend cycles idle" "backend cycles idle") + +metricgroup_name=(TopdownL1 TopdownL2) + +cleanup() { + rm -f "${stat_output}" + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +function commachecker() +{ + local -i cnt=0 + local prefix=1 + + case "$1" + in "--interval") prefix=2 + ;; "--per-thread") prefix=2 + ;; "--system-wide-no-aggr") prefix=2 + ;; "--per-core") prefix=3 + ;; "--per-socket") prefix=3 + ;; "--per-node") prefix=3 + ;; "--per-die") prefix=3 + ;; "--per-cache") prefix=3 + esac + + while read line + do + # Ignore initial "started on" comment. + x=${line:0:1} + [ "$x" = "#" ] && continue + # Ignore initial blank line. + [ "$line" = "" ] && continue + # Ignore "Performance counter stats" + x=${line:0:25} + [ "$x" = "Performance counter stats" ] && continue + # Ignore "seconds time elapsed" and break + [[ "$line" == *"time elapsed"* ]] && break + + main_body=$(echo $line | cut -d' ' -f$prefix-) + x=${main_body%#*} + # Check default metricgroup + y=$(echo $x | tr -d ' ') + [ "$y" = "" ] && continue + for i in "${!metricgroup_name[@]}"; do + [[ "$y" == *"${metricgroup_name[$i]}"* ]] && break + done + [[ "$y" == *"${metricgroup_name[$i]}"* ]] && continue + + # Check default event + for i in "${!event_name[@]}"; do + [[ "$x" == *"${event_name[$i]}"* ]] && break + done + + [[ ! "$x" == *"${event_name[$i]}"* ]] && { + echo "Unknown event name in $line" 1>&2 + exit 1; + } + + # Check event metric if it exists + [[ ! "$main_body" == *"#"* ]] && continue + [[ ! "$main_body" == *"${event_metric[$i]}"* ]] && { + echo "wrong event metric. expected ${event_metric[$i]} in $line" 1>&2 + exit 1; + } + done < "${stat_output}" + return 0 +} + +perf_cmd="-o ${stat_output}" + +skip_test=$(check_for_topology) +check_no_args "STD" "$perf_cmd" +check_system_wide "STD" "$perf_cmd" +check_interval "STD" "$perf_cmd" +check_per_thread "STD" "$perf_cmd" +check_per_node "STD" "$perf_cmd" +if [ $skip_test -ne 1 ] +then + check_system_wide_no_aggr "STD" "$perf_cmd" + check_per_core "STD" "$perf_cmd" + check_per_cache_instance "STD" "$perf_cmd" + check_per_die "STD" "$perf_cmd" + check_per_socket "STD" "$perf_cmd" +else + echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_die and per_socket since socket id exposed via topology is invalid" +fi +cleanup +exit 0 -- cgit v1.2.3 From f962514052aa5e8973343af1b1ba041d61424405 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:20 -0700 Subject: perf vendor events arm64: Add default tags for Hisi hip08 L1 metrics Add the default tags for Hisi hip08 as well. Signed-off-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json index 6443a061e22a..6463531b9941 100644 --- a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json +++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json @@ -3,28 +3,32 @@ "MetricExpr": "FETCH_BUBBLE / (4 * CPU_CYCLES)", "PublicDescription": "Frontend bound L1 topdown metric", "BriefDescription": "Frontend bound L1 topdown metric", - "MetricGroup": "TopDownL1", + "DefaultMetricgroupName": "TopDownL1", + "MetricGroup": "Default;TopDownL1", "MetricName": "frontend_bound" }, { "MetricExpr": "(INST_SPEC - INST_RETIRED) / (4 * CPU_CYCLES)", "PublicDescription": "Bad Speculation L1 topdown metric", "BriefDescription": "Bad Speculation L1 topdown metric", - "MetricGroup": "TopDownL1", + "DefaultMetricgroupName": "TopDownL1", + "MetricGroup": "Default;TopDownL1", "MetricName": "bad_speculation" }, { "MetricExpr": "INST_RETIRED / (CPU_CYCLES * 4)", "PublicDescription": "Retiring L1 topdown metric", "BriefDescription": "Retiring L1 topdown metric", - "MetricGroup": "TopDownL1", + "DefaultMetricgroupName": "TopDownL1", + "MetricGroup": "Default;TopDownL1", "MetricName": "retiring" }, { "MetricExpr": "1 - (frontend_bound + bad_speculation + retiring)", "PublicDescription": "Backend Bound L1 topdown metric", "BriefDescription": "Backend Bound L1 topdown metric", - "MetricGroup": "TopDownL1", + "DefaultMetricgroupName": "TopDownL1", + "MetricGroup": "Default;TopDownL1", "MetricName": "backend_bound" }, { -- cgit v1.2.3 From 66dc1920f6bbc172ee35520d024977d5330df842 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 25 Nov 2022 12:42:09 +0100 Subject: perf annotate: Work with vmlinux outside symfs It is currently possible to use --symfs along with a vmlinux which lies outside of the symfs by passing an absolute path to --vmlinux, thanks to the check in dso__load_vmlinux() which handles this explicitly. However, the annotate code lacks this check and thus 'perf annotate' does not work ("Internal error: Invalid -1 error code") for kernel functions with this combination. Add the missing handling. Signed-off-by: Vincent Whitchurch Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kernel@axis.com Link: https://lore.kernel.org/r/20221125114210.2353820-1-vincent.whitchurch@axis.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index cdd1924a4418..43865601f96c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1731,7 +1731,10 @@ fallback: * cache, or is just a kallsyms file, well, lets hope that this * DSO is the same as when 'perf record' ran. */ - __symbol__join_symfs(filename, filename_size, dso->long_name); + if (dso->kernel && dso->long_name[0] == '/') + snprintf(filename, filename_size, "%s", dso->long_name); + else + __symbol__join_symfs(filename, filename_size, dso->long_name); mutex_lock(&dso->lock); if (access(filename, R_OK) && errno == ENOENT && dso->nsinfo) { -- cgit v1.2.3 From 6fbd67b0f067bb1708d4c1a97b1ad53750b906b2 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 16 Jun 2023 10:14:37 +0200 Subject: perf test: fix failing test cases on linux-next for s390 In linux-next tree the many test cases fail on s390x when running the perf test suite, sometime the perf tool dumps core. Output before: 6.1: Test event parsing : FAILED! 10.3: Parsing of PMU event table metrics : FAILED! 10.4: Parsing of PMU event table metrics with fake PMUs: FAILED! 17: Setup struct perf_event_attr : FAILED! 24: Number of exit events of a simple workload : FAILED! 26: Object code reading : FAILED! 28: Use a dummy software event to keep tracking : FAILED! 35: Track with sched_switch : FAILED! 42.3: BPF prologue generation : FAILED! 66: Parse and process metrics : FAILED! 68: Event expansion for cgroups : FAILED! 69.2: Perf time to TSC : FAILED! 74: build id cache operations : FAILED! 86: Zstd perf.data compression/decompression : FAILED! 87: perf record tests : FAILED! 106: Test java symbol : FAILED! The reason for all these failure is a missing PMU. On s390x the PMU is named cpum_cf which is not detected as core PMU. A similar patch was added before, see commit 9bacbced0e32204d ("perf list: Add s390 support for detailed PMU event description") which got lost during the recent reworks. Add it again. Output after: 10.2: PMU event map aliases : FAILED! 42.3: BPF prologue generation : FAILED! Most test cases now work and there is not core dump anymore. Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Ian Rogers Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20230616081437.1932003-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index fe64ad292d36..6142e4710a2f 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1419,7 +1419,7 @@ void perf_pmu__del_formats(struct list_head *formats) bool is_pmu_core(const char *name) { - return !strcmp(name, "cpu") || is_sysfs_pmu_core(name); + return !strcmp(name, "cpu") || !strcmp(name, "cpum_cf") || is_sysfs_pmu_core(name); } bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu) -- cgit v1.2.3 From ed4090a22c123b9b33368741253edddc6ff8d18f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Jun 2023 00:32:10 -0700 Subject: perf stat: Reset aggr stats for each run When it runs multiple times with -r option, it missed to reset the aggregation counters and the values were added up. The aggregation count has the values to be printed in the end. It should reset the counters at the beginning of each run. But the current code does that only when -I/--interval-print option is given. Fixes: 91f85f98da7ab8c3 ("perf stat: Display event stats using aggr counts") Reported-by: Jiri Olsa Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230616073211.1057936-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3f4e76f76f94..7029e7a7cc2e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -725,6 +725,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) all_counters_use_bpf = false; } + evlist__reset_aggr_stats(evsel_list); + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; -- cgit v1.2.3 From dada1a1f5fbccc74e9e6754fc586b1e8b82ac0af Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Jun 2023 00:32:11 -0700 Subject: perf stat: Show average value on multiple runs When -r option is used, perf stat runs the command multiple times and update stats in the evsel->stats.res_stats for global aggregation. But the value is never used and the value it prints at the end is just the value from the last run. I think we should print the average number of multiple runs. Add evlist__copy_res_stats() to update the aggr counter (for display) using the values in the evsel->stats.res_stats. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230616073211.1057936-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 ++++- tools/perf/util/stat.c | 22 ++++++++++++++++++++++ tools/perf/util/stat.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7029e7a7cc2e..a3c04fb265f7 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2831,8 +2831,11 @@ int cmd_stat(int argc, const char **argv) } } - if (!forever && status != -1 && (!interval || stat_config.summary)) + if (!forever && status != -1 && (!interval || stat_config.summary)) { + if (stat_config.run_count > 1) + evlist__copy_res_stats(&stat_config, evsel_list); print_counters(NULL, argc, argv); + } evlist__finalize_ctlfd(evsel_list); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 0f7b8a8cdea6..967e583392c7 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -264,6 +264,28 @@ void evlist__copy_prev_raw_counts(struct evlist *evlist) evsel__copy_prev_raw_counts(evsel); } +static void evsel__copy_res_stats(struct evsel *evsel) +{ + struct perf_stat_evsel *ps = evsel->stats; + + /* + * For GLOBAL aggregation mode, it updates the counts for each run + * in the evsel->stats.res_stats. See perf_stat_process_counter(). + */ + *ps->aggr[0].counts.values = avg_stats(&ps->res_stats); +} + +void evlist__copy_res_stats(struct perf_stat_config *config, struct evlist *evlist) +{ + struct evsel *evsel; + + if (config->aggr_mode != AGGR_GLOBAL) + return; + + evlist__for_each_entry(evlist, evsel) + evsel__copy_res_stats(evsel); +} + static size_t pkg_id_hash(long __key, void *ctx __maybe_unused) { uint64_t *key = (uint64_t *) __key; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 934f79778cea..325d0fad1842 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -197,6 +197,7 @@ void evlist__save_aggr_prev_raw_counts(struct evlist *evlist); int evlist__alloc_aggr_stats(struct evlist *evlist, int nr_aggr); void evlist__reset_aggr_stats(struct evlist *evlist); +void evlist__copy_res_stats(struct perf_stat_config *config, struct evlist *evlist); int perf_stat_process_counter(struct perf_stat_config *config, struct evsel *counter); -- cgit v1.2.3 From cddfc5fb3f91b00b5d6818d974ed46184e8762a3 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Jun 2023 10:46:58 +0530 Subject: perf pmus: Describe semantics of 'core_pmus' and 'other_pmus' Notion of 'core_pmus' and 'other_pmus' are independent of hw core and uncore pmus. For example, AMD IBS PMUs are present in each SMT-thread but they belongs to 'other_pmus'. Add a comment describing what these list contains and how they are treated. Reviewed-by: Ian Rogers Signed-off-by: Ravi Bangoria Cc: Adrian Hunter Cc: Ali Saidi Cc: Ananth Narayan Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Cc: Thomas Richter Link: https://lore.kernel.org/r/20230615051700.1833-2-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index e1d0a93147e5..8c50ab8894b7 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -12,6 +12,21 @@ #include "pmu.h" #include "print-events.h" +/* + * core_pmus: A PMU belongs to core_pmus if it's name is "cpu" or it's sysfs + * directory contains "cpus" file. All PMUs belonging to core_pmus + * must have pmu->is_core=1. If there are more than one PMU in + * this list, perf interprets it as a heterogeneous platform. + * (FWIW, certain ARM platforms having heterogeneous cores uses + * homogeneous PMU, and thus they are treated as homogeneous + * platform by perf because core_pmus will have only one entry) + * other_pmus: All other PMUs which are not part of core_pmus list. It doesn't + * matter whether PMU is present per SMT-thread or outside of the + * core in the hw. For e.g., an instance of AMD ibs_fetch// and + * ibs_op// PMUs is present in each hw SMT thread, however they + * are captured under other_pmus. PMUs belonging to other_pmus + * must have pmu->is_core=0 but pmu->is_uncore could be 0 or 1. + */ static LIST_HEAD(core_pmus); static LIST_HEAD(other_pmus); static bool read_sysfs_core_pmus; -- cgit v1.2.3 From f0dc208267bb744e3b2008bb11f68d02663330ef Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Jun 2023 10:46:59 +0530 Subject: perf mem amd: Fix perf_pmus__num_mem_pmus() perf mem/c2c on AMD internally uses IBS OP PMU, not the core PMU. Also, AMD platforms does not have heterogeneous PMUs. Reviewed-by: Ian Rogers Signed-off-by: Ravi Bangoria Cc: Adrian Hunter Cc: Ali Saidi Cc: Ananth Narayan Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Cc: Thomas Richter Link: https://lore.kernel.org/r/20230615051700.1833-3-ravi.bangoria@amd.com [ Added the improved comment for perf_pmus__num_mem_pmus() as b4 didn't from the per-patch (not series) newer version ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/pmu.c | 12 ++++++++++++ tools/perf/util/pmus.c | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c index 3c0de3370d7e..65d8cdff4d5f 100644 --- a/tools/perf/arch/x86/util/pmu.c +++ b/tools/perf/arch/x86/util/pmu.c @@ -14,6 +14,8 @@ #include "../../../util/intel-bts.h" #include "../../../util/pmu.h" #include "../../../util/fncache.h" +#include "../../../util/pmus.h" +#include "env.h" struct pmu_alias { char *name; @@ -168,3 +170,13 @@ char *pmu_find_alias_name(const char *name) return __pmu_find_alias_name(name); } + +int perf_pmus__num_mem_pmus(void) +{ + /* AMD uses IBS OP pmu and not a core PMU for perf mem/c2c */ + if (x86__is_amd_cpu()) + return 1; + + /* Intel uses core pmus for perf mem/c2c */ + return perf_pmus__num_core_pmus(); +} diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 8c50ab8894b7..a2032c1b7644 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -242,7 +242,7 @@ const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str) return NULL; } -int perf_pmus__num_mem_pmus(void) +int __weak perf_pmus__num_mem_pmus(void) { /* All core PMUs are for mem events. */ return perf_pmus__num_core_pmus(); -- cgit v1.2.3 From 5752c20f3787c9bc9ff9411a70b3d41add85518c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Jun 2023 10:47:00 +0530 Subject: perf mem: Scan all PMUs instead of just core ones Scanning only core PMUs is not sufficient on platforms like AMD since perf mem on AMD uses IBS OP PMU, which is independent of core PMU. Scan all PMUs instead of just core PMUs. There should be negligible performance overhead because of scanning all PMUs, so we should be okay. Reviewed-by: Ian Rogers Signed-off-by: Ravi Bangoria Cc: Adrian Hunter Cc: Ali Saidi Cc: Ananth Narayan Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Cc: Thomas Richter Link: https://lore.kernel.org/r/20230615051700.1833-4-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mem-events.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index be15aadb6b14..c07fe3a90722 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -130,7 +130,12 @@ int perf_mem_events__init(void) if (!e->tag) continue; - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + /* + * Scan all PMUs not just core ones, since perf mem/c2c on + * platforms like AMD uses IBS OP PMU which is independent + * of core PMU. + */ + while ((pmu = perf_pmus__scan(pmu)) != NULL) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); e->supported |= perf_mem_event__supported(mnt, sysfs_name); } @@ -165,7 +170,7 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, char sysfs_name[100]; struct perf_pmu *pmu = NULL; - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); if (!perf_mem_event__supported(mnt, sysfs_name)) { @@ -188,7 +193,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, if (!e->record) continue; - if (perf_pmus__num_core_pmus() == 1) { + if (perf_pmus__num_mem_pmus() == 1) { if (!e->supported) { pr_err("failed: event '%s' not supported\n", perf_mem_events__name(j, NULL)); @@ -203,7 +208,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, return -1; } - while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + while ((pmu = perf_pmus__scan(pmu)) != NULL) { rec_argv[i++] = "-e"; s = perf_mem_events__name(j, pmu->name); if (s) { -- cgit v1.2.3 From bb6b369cb42737afa4114f371868eb1d858ff36e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 16 Jun 2023 09:56:07 +0800 Subject: perf test record+probe_libc_inet_pton.sh: Use "grep -F" instead of obsolescent "fgrep" There exists the following warning when executing 'perf test record+probe_libc_inet_pton.sh': fgrep: warning: fgrep is obsolescent; using grep -F This is tested on Fedora 38, the version of grep is 3.8, the latest version of grep claims the fgrep is obsolete, use "grep -F" instead of "fgrep" to silence the warning. Signed-off-by: Tiezhu Yang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: loongson-kernel@lists.loongnix.cn Link: https://lore.kernel.org/r/1686880567-30017-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 0934fb0cd68f..89214a6d9951 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -14,7 +14,7 @@ . "$(dirname "$0")/lib/probe_vfs_getname.sh" libc=$(grep -w libc /proc/self/maps | head -1 | sed -r 's/.*[[:space:]](\/.*)/\1/g') -nm -Dg $libc 2>/dev/null | fgrep -q inet_pton || exit 254 +nm -Dg $libc 2>/dev/null | grep -F -q inet_pton || exit 254 event_pattern='probe_libc:inet_pton(\_[[:digit:]]+)?' @@ -94,7 +94,7 @@ delete_libc_inet_pton_event() { } # Check for IPv6 interface existence -ip a sh lo | fgrep -q inet6 || exit 2 +ip a sh lo | grep -F -q inet6 || exit 2 skip_if_no_perf_probe && \ add_libc_inet_pton_event && \ -- cgit v1.2.3 From e2be06662c1f310b60a3929f3fc944809c067307 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 16 Jun 2023 18:57:48 -0300 Subject: perf print-events: Export is_event_supported() Will be used when checking if we can encode the PMU number in perf_event_attr.type, part of the logic to use in hybrid systems (multiple types of CPUs, such as Intel's (Alder Lake, etc) or ARM's big.LITTLE). Co-developed-with: Ian Rogers Cc: James Clark Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Sumanth Korikkar Cc: Suzuki K Poulose Cc: Thomas Richter Cc: Will Deacon Link: https://lore.kernel.org/linux-perf-users/ZIzYgImv61OGK1wA@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 2 +- tools/perf/util/print-events.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 7a5f87392720..a7566edc86a3 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -229,7 +229,7 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state) strlist__delete(sdtlist); } -static bool is_event_supported(u8 type, u64 config) +bool is_event_supported(u8 type, u64 config) { bool ret = true; int open_return; diff --git a/tools/perf/util/print-events.h b/tools/perf/util/print-events.h index e75a3d7e3fe3..d7fab411e75c 100644 --- a/tools/perf/util/print-events.h +++ b/tools/perf/util/print-events.h @@ -3,6 +3,7 @@ #define __PERF_PRINT_EVENTS_H #include +#include #include struct event_symbol; @@ -36,5 +37,6 @@ void print_symbol_events(const struct print_callbacks *print_cb, void *print_sta unsigned int max); void print_tool_events(const struct print_callbacks *print_cb, void *print_state); void print_tracepoint_events(const struct print_callbacks *print_cb, void *print_state); +bool is_event_supported(u8 type, u64 config); #endif /* __PERF_PRINT_EVENTS_H */ -- cgit v1.2.3 From 82fe2e45cdb00de4fa648050ae33bdadf9b3294a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 16 Jun 2023 19:01:34 -0300 Subject: perf pmus: Check if we can encode the PMU number in perf_event_attr.type In some architectures we can't encode the PMU number in perf_event_attr.type and thus can't just ask for the same event in multiple CPUs (and thus PMUs), that is what we want in hybrid systems but we can't when that encoding isn't understood by the kernel, such as in ARM64's big.LITTLE. If that is the case, fallback to the previous behaviour till we find a better solution to have consistent output accross architectures with hybrid CPU configurations. Co-developed-with: Ian Rogers Cc: James Clark Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Sumanth Korikkar Cc: Suzuki K Poulose Cc: Thomas Richter Cc: Will Deacon Link: https://lore.kernel.org/linux-perf-users/ZIzYgImv61OGK1wA@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index a2032c1b7644..d891d72c824e 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "debug.h" @@ -492,9 +493,35 @@ int perf_pmus__num_core_pmus(void) return count; } +static bool __perf_pmus__supports_extended_type(void) +{ + struct perf_pmu *pmu = NULL; + + if (perf_pmus__num_core_pmus() <= 1) + return false; + + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + if (!is_event_supported(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES | ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT))) + return false; + } + + return true; +} + +static bool perf_pmus__do_support_extended_type; + +static void perf_pmus__init_supports_extended_type(void) +{ + perf_pmus__do_support_extended_type = __perf_pmus__supports_extended_type(); +} + bool perf_pmus__supports_extended_type(void) { - return perf_pmus__num_core_pmus() > 1; + static pthread_once_t extended_type_once = PTHREAD_ONCE_INIT; + + pthread_once(&extended_type_once, perf_pmus__init_supports_extended_type); + + return perf_pmus__do_support_extended_type; } struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) -- cgit v1.2.3 From db1f5f1038a2df67f549e2657f56327e28127c27 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Wed, 14 Jun 2023 02:15:05 +0000 Subject: perf stat: Add missing newline in pr_err messages The newline is missing for error messages in add_default_attributes() Before: # perf stat --topdown Topdown requested but the topdown metric groups aren't present. (See perf list the metric groups have names like TopdownL1)# After: # perf stat --topdown Topdown requested but the topdown metric groups aren't present. (See perf list the metric groups have names like TopdownL1) # In addition, perf_stat_init_aggr_mode() and perf_stat_init_aggr_mode_file() have the same problem, fixed by the way. Signed-off-by: Yang Jihong Acked-by: Ian Rogers Reviewed-by: James Clark Link: https://lore.kernel.org/r/20230614021505.59856-1-yangjihong1@huawei.com Signed-off-by: Namhyung Kim --- tools/perf/builtin-stat.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a3c04fb265f7..07b48f6df48e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1596,7 +1596,7 @@ static int perf_stat_init_aggr_mode(void) stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus, get_id, /*data=*/NULL, needs_sort); if (!stat_config.aggr_map) { - pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]); + pr_err("cannot build %s map\n", aggr_mode__string[stat_config.aggr_mode]); return -1; } stat_config.aggr_get_id = aggr_mode__get_id(stat_config.aggr_mode); @@ -1912,7 +1912,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus, get_id, env, needs_sort); if (!stat_config.aggr_map) { - pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]); + pr_err("cannot build %s map\n", aggr_mode__string[stat_config.aggr_mode]); return -1; } stat_config.aggr_get_id = aggr_mode__get_id_file(stat_config.aggr_mode); @@ -2052,7 +2052,7 @@ static int add_default_attributes(void) * on an architecture test for such a metric name. */ if (!metricgroup__has_metric(pmu, "transaction")) { - pr_err("Missing transaction metrics"); + pr_err("Missing transaction metrics\n"); return -1; } return metricgroup__parse_groups(evsel_list, pmu, "transaction", @@ -2068,7 +2068,7 @@ static int add_default_attributes(void) int smi; if (sysfs__read_int(FREEZE_ON_SMI_PATH, &smi) < 0) { - pr_err("freeze_on_smi is not supported."); + pr_err("freeze_on_smi is not supported.\n"); return -1; } @@ -2081,7 +2081,7 @@ static int add_default_attributes(void) } if (!metricgroup__has_metric(pmu, "smi")) { - pr_err("Missing smi metrics"); + pr_err("Missing smi metrics\n"); return -1; } @@ -2106,7 +2106,7 @@ static int add_default_attributes(void) if (!max_level) { pr_err("Topdown requested but the topdown metric groups aren't present.\n" - "(See perf list the metric groups have names like TopdownL1)"); + "(See perf list the metric groups have names like TopdownL1)\n"); return -1; } if (stat_config.topdown_level > max_level) { -- cgit v1.2.3 From bc06026d1420e006503c69dc6829cc45590db106 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Fri, 16 Jun 2023 02:45:15 +0000 Subject: perf parse: Add missing newline to pr_debug message in evsel__compute_group_pmu_name() The newline is missing for pr_debug message in evsel__compute_group_pmu_name(), fix it. Before: # perf --debug verbose=2 record -e cpu-clock true No PMU found for 'cycles:u'No PMU found for 'instructions:u'------------------------------------------------------------ perf_event_attr: type 1 size 136 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD read_format ID|LOST disabled 1 inherit 1 mmap 1 comm 1 freq 1 enable_on_exec 1 task 1 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ksymbol 1 bpf_event 1 ------------------------------------------------------------ After: # perf --debug verbose=2 record -e cpu-clock true No PMU found for 'cycles:u' No PMU found for 'instructions:u' ------------------------------------------------------------ perf_event_attr: type 1 size 136 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD read_format ID|LOST disabled 1 inherit 1 mmap 1 comm 1 freq 1 enable_on_exec 1 task 1 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ksymbol 1 bpf_event 1 ------------------------------------------------------------ Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: mark.rutland@arm.com Cc: irogers@google.com Cc: peterz@infradead.org Cc: adrian.hunter@intel.com Cc: acme@kernel.org Cc: jolsa@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: kan.liang@linux.intel.com Cc: mingo@redhat.com Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20230616024515.80814-1-yangjihong1@huawei.com Signed-off-by: Namhyung Kim --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2d36cadf35ec..bc7274641f34 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2036,7 +2036,7 @@ static int evsel__compute_group_pmu_name(struct evsel *evsel, pmu = perf_pmus__scan_core(NULL); } if (!pmu) { - pr_debug("No PMU found for '%s'", evsel__name(evsel)); + pr_debug("No PMU found for '%s'\n", evsel__name(evsel)); return -EINVAL; } group_pmu_name = pmu->name; -- cgit v1.2.3 From 240de691dd6684d15f63613aa7fbc64e6098d6c9 Mon Sep 17 00:00:00 2001 From: "baomingtong001@208suo.com" Date: Wed, 14 Jun 2023 16:13:53 +0800 Subject: perf parse-events: Remove unneeded semicolon ./tools/perf/util/parse-events.c:1466:2-3: Unneeded semicolon Signed-off-by: Mingtong Bao Link: https://lore.kernel.org/r/2c733a91717eae93119ba2226420fd8f@208suo.com Signed-off-by: Namhyung Kim --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bc7274641f34..5dcfbf316bf6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1478,7 +1478,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, if (extended_type && (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)) { assert(perf_pmus__supports_extended_type()); attr.config |= (u64)extended_type << PERF_PMU_TYPE_SHIFT; - }; + } if (head_config) { if (config_attr(&attr, head_config, parse_state->error, -- cgit v1.2.3 From 53fc25b7f557089aff101235152ae4bff15c428a Mon Sep 17 00:00:00 2001 From: Chenyuan Mi Date: Wed, 14 Jun 2023 08:01:18 -0700 Subject: perf subcmd: Fix missing check for return value of malloc() in add_cmdname() The malloc() function may return NULL when it fails, which may cause null pointer deference in add_cmdname(), add Null check for return value of malloc(). Found by our static analysis tool. Signed-off-by: Chenyuan Mi Acked-by: Ian Rogers Cc: Leo Yan Cc: Arnaldo Carvalho de Melo Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/20230614150118.115208-1-cymi20@fudan.edu.cn Signed-off-by: Namhyung Kim --- tools/lib/subcmd/help.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index a66fb1a1a312..67a8d6b740ea 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -16,6 +16,8 @@ void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) { struct cmdname *ent = malloc(sizeof(*ent) + len + 1); + if (!ent) + return; ent->len = len; memcpy(ent->name, name, len); -- cgit v1.2.3 From 5e37ef5c2a5303d41842b8277770064632533318 Mon Sep 17 00:00:00 2001 From: Li Dong Date: Mon, 19 Jun 2023 16:20:10 +0800 Subject: tools: Fix incorrect calculation of object size by sizeof What we need to calculate is the size of the object, not the size of the pointer. Fixed: 51cfe7a3e87e ("perf python: Avoid 2 leak sanitizer issues") Signed-off-by: Li Dong Acked-by: Namhyung Kim Cc: Mark Rutland Cc: Ian Rogers Cc: Sean Christopherson Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Ingo Molnar Cc: opensource.kernel@vivo.com Link: https://lore.kernel.org/r/20230619082036.410-1-lidong@vivo.com Signed-off-by: Namhyung Kim --- tools/perf/util/scripting-engines/trace-event-python.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 59063ec98619..25fcd6630a4d 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -771,12 +771,12 @@ static void set_regs_in_dict(PyObject *dict, int size = __sw_hweight64(attr->sample_regs_intr) * 28; char *bf = malloc(size); - regs_map(&sample->intr_regs, attr->sample_regs_intr, arch, bf, sizeof(bf)); + regs_map(&sample->intr_regs, attr->sample_regs_intr, arch, bf, size); pydict_set_item_string_decref(dict, "iregs", _PyUnicode_FromString(bf)); - regs_map(&sample->user_regs, attr->sample_regs_user, arch, bf, sizeof(bf)); + regs_map(&sample->user_regs, attr->sample_regs_user, arch, bf, size); pydict_set_item_string_decref(dict, "uregs", _PyUnicode_FromString(bf)); -- cgit v1.2.3 From 0650b2b2e62edfa9510ba0c80f42d98c4a748b12 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 14 Jun 2023 21:07:14 -0700 Subject: perf sharded_mutex: Introduce sharded_mutex Per object mutexes may come with significant memory cost while a global mutex can suffer from unnecessary contention. A sharded mutex is a compromise where objects are hashed and then a particular mutex for the hash of the object used. Contention can be controlled by the number of shards. v2. Use hashmap.h's hash_bits in case of contention from alignment of objects. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Andres Freund Cc: Mark Rutland Cc: Yuan Can Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Huacai Chen Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Alexander Shishkin Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230615040715.2064350-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/Build | 1 + tools/perf/util/sharded_mutex.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/sharded_mutex.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 tools/perf/util/sharded_mutex.c create mode 100644 tools/perf/util/sharded_mutex.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index ff2fd1a36bb8..96f4ea1d45c5 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -145,6 +145,7 @@ perf-y += mem2node.o perf-y += clockid.o perf-y += list_sort.o perf-y += mutex.o +perf-y += sharded_mutex.o perf-$(CONFIG_LIBBPF) += bpf-loader.o perf-$(CONFIG_LIBBPF) += bpf_map.o diff --git a/tools/perf/util/sharded_mutex.c b/tools/perf/util/sharded_mutex.c new file mode 100644 index 000000000000..e11e8d0945a7 --- /dev/null +++ b/tools/perf/util/sharded_mutex.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "sharded_mutex.h" + +#include + +struct sharded_mutex *sharded_mutex__new(size_t num_shards) +{ + struct sharded_mutex *result; + size_t size; + unsigned int bits; + + for (bits = 0; ((size_t)1 << bits) < num_shards; bits++) + ; + + size = sizeof(*result) + sizeof(struct mutex) * (1 << bits); + result = malloc(size); + if (!result) + return NULL; + + result->cap_bits = bits; + for (size_t i = 0; i < ((size_t)1 << bits); i++) + mutex_init(&result->mutexes[i]); + + return result; +} + +void sharded_mutex__delete(struct sharded_mutex *sm) +{ + for (size_t i = 0; i < ((size_t)1 << sm->cap_bits); i++) + mutex_destroy(&sm->mutexes[i]); + + free(sm); +} diff --git a/tools/perf/util/sharded_mutex.h b/tools/perf/util/sharded_mutex.h new file mode 100644 index 000000000000..7325e969eee3 --- /dev/null +++ b/tools/perf/util/sharded_mutex.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef PERF_SHARDED_MUTEX_H +#define PERF_SHARDED_MUTEX_H + +#include "mutex.h" +#include "hashmap.h" + +/* + * In a situation where a lock is needed per object, having a mutex can be + * relatively memory expensive (40 bytes on x86-64). If the object can be + * constantly hashed, a sharded mutex is an alternative global pool of mutexes + * where the mutex is looked up from a hash value. This can lead to collisions + * if the number of shards isn't large enough. + */ +struct sharded_mutex { + /* mutexes array is 1<mutexes[hash_bits(hash, sm->cap_bits)]; +} + +#endif /* PERF_SHARDED_MUTEX_H */ -- cgit v1.2.3 From 2e9f9d4a729f12b4bc3fa60406374327b1809abe Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 14 Jun 2023 21:07:15 -0700 Subject: perf annotation: Switch lock from a mutex to a sharded_mutex Remove the "struct mutex lock" variable from annotation that is allocated per symbol. This removes in the region of 40 bytes per symbol allocation. Use a sharded mutex where the number of shards is set to the number of CPUs. Assuming good hashing of the annotation (done based on the pointer), this means in order to contend there needs to be more threads than CPUs, which is not currently true in any perf command. Were contention an issue it is straightforward to increase the number of shards in the mutex. On my Debian/glibc based machine, this reduces the size of struct annotation from 136 bytes to 96 bytes, or nearly 30%. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Andres Freund Cc: Mark Rutland Cc: Yuan Can Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Huacai Chen Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Alexander Shishkin Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230615040715.2064350-2-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/builtin-top.c | 14 ++++----- tools/perf/ui/browsers/annotate.c | 10 +++--- tools/perf/util/annotate.c | 66 +++++++++++++++++++++++++++++++++------ tools/perf/util/annotate.h | 11 +++++-- 4 files changed, 77 insertions(+), 24 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c363c04e16df..1baa2acb3ced 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -137,10 +137,10 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) } notes = symbol__annotation(sym); - mutex_lock(¬es->lock); + annotation__lock(notes); if (!symbol__hists(sym, top->evlist->core.nr_entries)) { - mutex_unlock(¬es->lock); + annotation__unlock(notes); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); sleep(1); @@ -156,7 +156,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) pr_err("Couldn't annotate %s: %s\n", sym->name, msg); } - mutex_unlock(¬es->lock); + annotation__unlock(notes); return err; } @@ -211,12 +211,12 @@ static void perf_top__record_precise_ip(struct perf_top *top, notes = symbol__annotation(sym); - if (!mutex_trylock(¬es->lock)) + if (!annotation__trylock(notes)) return; err = hist_entry__inc_addr_samples(he, sample, evsel, ip); - mutex_unlock(¬es->lock); + annotation__unlock(notes); if (unlikely(err)) { /* @@ -253,7 +253,7 @@ static void perf_top__show_details(struct perf_top *top) symbol = he->ms.sym; notes = symbol__annotation(symbol); - mutex_lock(¬es->lock); + annotation__lock(notes); symbol__calc_percent(symbol, evsel); @@ -274,7 +274,7 @@ static void perf_top__show_details(struct perf_top *top) if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); out_unlock: - mutex_unlock(¬es->lock); + annotation__unlock(notes); } static void perf_top__resort_hists(struct perf_top *t) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 70bad42b807b..ccdb2cd11fbf 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -314,7 +314,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, browser->entries = RB_ROOT; - mutex_lock(¬es->lock); + annotation__lock(notes); symbol__calc_percent(sym, evsel); @@ -343,7 +343,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, } disasm_rb_tree__insert(browser, &pos->al); } - mutex_unlock(¬es->lock); + annotation__unlock(notes); browser->curr_hot = rb_last(&browser->entries); } @@ -470,10 +470,10 @@ static bool annotate_browser__callq(struct annotate_browser *browser, } notes = symbol__annotation(dl->ops.target.sym); - mutex_lock(¬es->lock); + annotation__lock(notes); if (!symbol__hists(dl->ops.target.sym, evsel->evlist->core.nr_entries)) { - mutex_unlock(¬es->lock); + annotation__unlock(notes); ui__warning("Not enough memory for annotating '%s' symbol!\n", dl->ops.target.sym->name); return true; @@ -482,7 +482,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser, target_ms.maps = ms->maps; target_ms.map = ms->map; target_ms.sym = dl->ops.target.sym; - mutex_unlock(¬es->lock); + annotation__unlock(notes); symbol__tui_annotate(&target_ms, evsel, hbt, browser->opts); sym_title(ms->sym, ms->map, title, sizeof(title), browser->opts->percent_type); ui_browser__show_title(&browser->b, title); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 43865601f96c..77c816400719 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -32,6 +32,7 @@ #include "block-range.h" #include "string2.h" #include "util/event.h" +#include "util/sharded_mutex.h" #include "arch/common.h" #include "namespaces.h" #include @@ -856,7 +857,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); - mutex_lock(¬es->lock); + annotation__lock(notes); if (notes->src != NULL) { memset(notes->src->histograms, 0, notes->src->nr_histograms * notes->src->sizeof_sym_hist); @@ -864,7 +865,7 @@ void symbol__annotate_zero_histograms(struct symbol *sym) memset(notes->src->cycles_hist, 0, symbol__size(sym) * sizeof(struct cyc_hist)); } - mutex_unlock(¬es->lock); + annotation__unlock(notes); } static int __symbol__account_cycles(struct cyc_hist *ch, @@ -1121,7 +1122,7 @@ void annotation__compute_ipc(struct annotation *notes, size_t size) notes->hit_insn = 0; notes->cover_insn = 0; - mutex_lock(¬es->lock); + annotation__lock(notes); for (offset = size - 1; offset >= 0; --offset) { struct cyc_hist *ch; @@ -1140,7 +1141,7 @@ void annotation__compute_ipc(struct annotation *notes, size_t size) notes->have_cycles = true; } } - mutex_unlock(¬es->lock); + annotation__unlock(notes); } int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, @@ -1291,17 +1292,64 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r return ins__scnprintf(&dl->ins, bf, size, &dl->ops, max_ins_name); } -void annotation__init(struct annotation *notes) +void annotation__exit(struct annotation *notes) { - mutex_init(¬es->lock); + annotated_source__delete(notes->src); } -void annotation__exit(struct annotation *notes) +static struct sharded_mutex *sharded_mutex; + +static void annotation__init_sharded_mutex(void) { - annotated_source__delete(notes->src); - mutex_destroy(¬es->lock); + /* As many mutexes as there are CPUs. */ + sharded_mutex = sharded_mutex__new(cpu__max_present_cpu().cpu); +} + +static size_t annotation__hash(const struct annotation *notes) +{ + return (size_t)notes; } +static struct mutex *annotation__get_mutex(const struct annotation *notes) +{ + static pthread_once_t once = PTHREAD_ONCE_INIT; + + pthread_once(&once, annotation__init_sharded_mutex); + if (!sharded_mutex) + return NULL; + + return sharded_mutex__get_mutex(sharded_mutex, annotation__hash(notes)); +} + +void annotation__lock(struct annotation *notes) + NO_THREAD_SAFETY_ANALYSIS +{ + struct mutex *mutex = annotation__get_mutex(notes); + + if (mutex) + mutex_lock(mutex); +} + +void annotation__unlock(struct annotation *notes) + NO_THREAD_SAFETY_ANALYSIS +{ + struct mutex *mutex = annotation__get_mutex(notes); + + if (mutex) + mutex_unlock(mutex); +} + +bool annotation__trylock(struct annotation *notes) +{ + struct mutex *mutex = annotation__get_mutex(notes); + + if (!mutex) + return false; + + return mutex_trylock(mutex); +} + + static void annotation_line__add(struct annotation_line *al, struct list_head *head) { list_add_tail(&al->node, head); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 1c6335b8333a..962780559176 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -271,8 +271,7 @@ struct annotated_source { struct sym_hist *histograms; }; -struct annotation { - struct mutex lock; +struct LOCKABLE annotation { u64 max_coverage; u64 start; u64 hit_cycles; @@ -298,9 +297,15 @@ struct annotation { struct annotated_source *src; }; -void annotation__init(struct annotation *notes); +static inline void annotation__init(struct annotation *notes __maybe_unused) +{ +} void annotation__exit(struct annotation *notes); +void annotation__lock(struct annotation *notes) EXCLUSIVE_LOCK_FUNCTION(*notes); +void annotation__unlock(struct annotation *notes) UNLOCK_FUNCTION(*notes); +bool annotation__trylock(struct annotation *notes) EXCLUSIVE_TRYLOCK_FUNCTION(true, *notes); + static inline int annotation__cycles_width(struct annotation *notes) { if (notes->have_cycles && notes->options->show_minmax_cycle) -- cgit v1.2.3 From 4ca0d340ce206985d9b9956993d7c81eeb1d3198 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 20 Jun 2023 21:20:25 +0800 Subject: perf annotate: Fix instruction association and parsing for LoongArch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the perf annotate view for LoongArch, there is no arrowed line pointing to the target from the branch instruction. This issue is caused by incorrect instruction association and parsing. $ perf record alloc-6276705c94ad1398 # rust benchmark $ perf report 0.28 │ ori $a1, $zero, 0x63 │ move $a2, $zero 10.55 │ addi.d $a3, $a2, 1(0x1) │ sltu $a4, $a3, $s7 9.53 │ masknez $a4, $s7, $a4 │ sub.d $a3, $a3, $a4 12.12 │ st.d $a1, $fp, 24(0x18) │ st.d $a3, $fp, 16(0x10) 16.29 │ slli.d $a2, $a2, 0x2 │ ldx.w $a2, $s8, $a2 12.77 │ st.w $a2, $sp, 724(0x2d4) │ st.w $s0, $sp, 720(0x2d0) 7.03 │ addi.d $a2, $sp, 720(0x2d0) │ addi.d $a1, $a1, -1(0xfff) 12.03 │ move $a2, $a3 │ → bne $a1, $s3, -52(0x3ffcc) # 82ce8 2.50 │ addi.d $a0, $a0, 1(0x1) This patch fixes instruction association issues, such as associating branch instructions with jump_ops instead of call_ops, and corrects false instruction matches. It also implements branch instruction parsing specifically for LoongArch. With this patch, we will be able to see the arrowed line. 0.79 │3ec: ori $a1, $zero, 0x63 │ move $a2, $zero 10.32 │3f4:┌─→addi.d $a3, $a2, 1(0x1) │ │ sltu $a4, $a3, $s7 10.44 │ │ masknez $a4, $s7, $a4 │ │ sub.d $a3, $a3, $a4 14.17 │ │ st.d $a1, $fp, 24(0x18) │ │ st.d $a3, $fp, 16(0x10) 13.15 │ │ slli.d $a2, $a2, 0x2 │ │ ldx.w $a2, $s8, $a2 11.00 │ │ st.w $a2, $sp, 724(0x2d4) │ │ st.w $s0, $sp, 720(0x2d0) 8.00 │ │ addi.d $a2, $sp, 720(0x2d0) │ │ addi.d $a1, $a1, -1(0xfff) 11.99 │ │ move $a2, $a3 │ └──bne $a1, $s3, 3f4 3.17 │ addi.d $a0, $a0, 1(0x1) Signed-off-by: WANG Rui Acked-by: Namhyung Kim Cc: Mark Rutland Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: loongarch@lists.linux.dev Cc: loongson-kernel@lists.loongnix.cn Cc: Huacai Chen Cc: Tiezhu Yang Cc: Ingo Molnar Cc: WANG Xuerui Link: https://lore.kernel.org/r/20230620132025.105563-1-wangrui@loongson.cn Signed-off-by: Namhyung Kim --- tools/perf/arch/loongarch/annotate/instructions.c | 116 +++++++++++++++++++--- tools/perf/arch/s390/annotate/instructions.c | 3 - tools/perf/util/annotate.c | 8 +- 3 files changed, 109 insertions(+), 18 deletions(-) diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index ab21bf122135..98e19c5366ac 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -5,25 +5,115 @@ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ +static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + char *c, *endptr, *tok, *name; + struct map *map = ms->map; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + + c = strchr(ops->raw, '#'); + if (c++ == NULL) + return -1; + + ops->target.addr = strtoull(c, &endptr, 16); + + name = strchr(endptr, '<'); + name++; + + if (arch->objdump.skip_functions_char && + strchr(name, arch->objdump.skip_functions_char)) + return -1; + + tok = strchr(name, '>'); + if (tok == NULL) + return -1; + + *tok = '\0'; + ops->target.name = strdup(name); + *tok = '>'; + + if (ops->target.name == NULL) + return -1; + + target.addr = map__objdump_2mem(map, ops->target.addr); + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + return 0; +} + +static struct ins_ops loongarch_call_ops = { + .parse = loongarch_call__parse, + .scnprintf = call__scnprintf, +}; + +static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +{ + struct map *map = ms->map; + struct symbol *sym = ms->sym; + struct addr_map_symbol target = { + .ms = { .map = map, }, + }; + const char *c = strchr(ops->raw, '#'); + u64 start, end; + + ops->raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->raw_func_start = strchr(ops->raw, '<'); + + if (ops->raw_func_start && c > ops->raw_func_start) + c = NULL; + + if (c++ != NULL) + ops->target.addr = strtoull(c, NULL, 16); + else + ops->target.addr = strtoull(ops->raw, NULL, 16); + + target.addr = map__objdump_2mem(map, ops->target.addr); + start = map__unmap_ip(map, sym->start); + end = map__unmap_ip(map, sym->end); + + ops->target.outside = target.addr < start || target.addr > end; + + if (maps__find_ams(ms->maps, &target) == 0 && + map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr) + ops->target.sym = target.ms.sym; + + if (!ops->target.outside) { + ops->target.offset = target.addr - start; + ops->target.offset_avail = true; + } else { + ops->target.offset_avail = false; + } + + return 0; +} + +static struct ins_ops loongarch_jump_ops = { + .parse = loongarch_jump__parse, + .scnprintf = jump__scnprintf, +}; + static struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name) { struct ins_ops *ops = NULL; - if (!strncmp(name, "beqz", 4) || - !strncmp(name, "bnez", 4) || - !strncmp(name, "beq", 3) || - !strncmp(name, "bne", 3) || - !strncmp(name, "blt", 3) || - !strncmp(name, "bge", 3) || - !strncmp(name, "bltu", 4) || - !strncmp(name, "bgeu", 4) || - !strncmp(name, "bl", 2)) - ops = &call_ops; - else if (!strncmp(name, "jirl", 4)) + if (!strcmp(name, "bl")) + ops = &loongarch_call_ops; + else if (!strcmp(name, "jirl")) ops = &ret_ops; - else if (name[0] == 'b') - ops = &jump_ops; + else if (!strcmp(name, "b") || + !strncmp(name, "beq", 3) || + !strncmp(name, "bne", 3) || + !strncmp(name, "blt", 3) || + !strncmp(name, "bge", 3) || + !strncmp(name, "bltu", 4) || + !strncmp(name, "bgeu", 4)) + ops = &loongarch_jump_ops; else return NULL; diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index de925b0e35ce..da5aa3e1f04c 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -45,9 +45,6 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops, return 0; } -static int call__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); - static struct ins_ops s390_call_ops = { .parse = s390_call__parse, .scnprintf = call__scnprintf, diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 77c816400719..ba988a13dacb 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -62,6 +62,10 @@ static regex_t file_lineno; static struct ins_ops *ins__find(struct arch *arch, const char *name); static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); +static int call__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); +static int jump__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name); struct arch { const char *name; @@ -324,7 +328,7 @@ static struct ins_ops call_ops = { bool ins__is_call(const struct ins *ins) { - return ins->ops == &call_ops || ins->ops == &s390_call_ops; + return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops; } /* @@ -465,7 +469,7 @@ static struct ins_ops jump_ops = { bool ins__is_jump(const struct ins *ins) { - return ins->ops == &jump_ops; + return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops; } static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) -- cgit v1.2.3 From 362f9c907fd8c2be3d5c5686ea787bca25443cdc Mon Sep 17 00:00:00 2001 From: elisabeth Date: Fri, 2 Jun 2023 14:38:15 +0200 Subject: perf jit: Fix incorrect file name in DWARF line table Fixes an issue where an incorrect filename was added in the DWARF line table of an ELF object file when calling 'perf inject --jit' due to not checking the filename of a debug entry against the repeated name marker (/xff/0). The marker is mentioned in the tools/perf/util/jitdump.h header, which describes the jitdump binary format, and indicitates that the filename in a debug entry is the same as the previous enrty. In the function emit_lineno_info(), in the file tools/perf/util/genelf-debug.c, the debug entry filename gets compared to the previous entry filename. If they are not the same, a new filename is added to the DWARF line table. However, since there is no check against '\xff\0', in some cases '\xff\0' is inserted as the filename into the DWARF line table. This can be seen with `objdump --dwarf=line` on the ELF file after `perf inject --jit`. It also makes no source code information show up in 'perf annotate'. Signed-off-by: Elisabeth Panholzer Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20230602123815.255001-1-paniii94@gmail.com [ Fixed a trailing white space, removed a subject prefix ] Signed-off-by: Namhyung Kim --- tools/perf/util/genelf_debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c index aa5dcc56b2ac..8588b3e35e00 100644 --- a/tools/perf/util/genelf_debug.c +++ b/tools/perf/util/genelf_debug.c @@ -337,6 +337,9 @@ static void emit_lineno_info(struct buffer_ext *be, { size_t i; + /* as described in the jitdump format */ + const char repeated_name_marker[] = {'\xff', '\0'}; + /* * Machine state at start of a statement program * address = 0 @@ -363,7 +366,8 @@ static void emit_lineno_info(struct buffer_ext *be, /* * check if filename changed, if so add it */ - if (!cur_filename || strcmp(cur_filename, ent->name)) { + if ((!cur_filename || strcmp(cur_filename, ent->name)) && + strcmp(repeated_name_marker, ent->name)) { emit_lne_define_filename(be, ent->name); cur_filename = ent->name; emit_set_file(be, ++cur_file_idx); -- cgit v1.2.3 From 3ad7092f5145aab4118f575b57f0ab1707b1cd36 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Tue, 20 Jun 2023 10:00:25 -0700 Subject: perf test: Add metric value validation test Add metric value validation test to check if metric values are with in correct value ranges. There are three types of tests included: 1) positive-value test checks if all the metrics collected are non-negative; 2) single-value test checks if the list of metrics have values in given value ranges; 3) relationship test checks if multiple metrics follow a given relationship, e.g. memory_bandwidth_read + memory_bandwidth_write = memory_bandwidth_total. Signed-off-by: Weilin Wang Tested-by: Namhyung Kim Cc: ravi.bangoria@amd.com Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Caleb Biggers Cc: Perry Taylor Cc: Samantha Alt Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230620170027.1861012-2-weilin.wang@intel.com Signed-off-by: Namhyung Kim --- .../perf/tests/shell/lib/perf_metric_validation.py | 514 +++++++++++++++++++++ .../shell/lib/perf_metric_validation_rules.json | 387 ++++++++++++++++ tools/perf/tests/shell/stat_metrics_values.sh | 30 ++ 3 files changed, 931 insertions(+) create mode 100644 tools/perf/tests/shell/lib/perf_metric_validation.py create mode 100644 tools/perf/tests/shell/lib/perf_metric_validation_rules.json create mode 100755 tools/perf/tests/shell/stat_metrics_values.sh diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py new file mode 100644 index 000000000000..81bd2bf38b67 --- /dev/null +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py @@ -0,0 +1,514 @@ +#SPDX-License-Identifier: GPL-2.0 +import re +import csv +import json +import argparse +from pathlib import Path +import subprocess + +class Validator: + def __init__(self, rulefname, reportfname='', t=5, debug=False, datafname='', fullrulefname='', workload='true', metrics=''): + self.rulefname = rulefname + self.reportfname = reportfname + self.rules = None + self.collectlist=metrics + self.metrics = set() + self.tolerance = t + + self.workloads = [x for x in workload.split(",") if x] + self.wlidx = 0 # idx of current workloads + self.allresults = dict() # metric results of all workload + self.allignoremetrics = dict() # metrics with no results or negative results + self.allfailtests = dict() + self.alltotalcnt = dict() + self.allpassedcnt = dict() + self.allerrlist = dict() + + self.results = dict() # metric results of current workload + # vars for test pass/failure statistics + self.ignoremetrics= set() # metrics with no results or negative results, neg result counts as a failed test + self.failtests = dict() + self.totalcnt = 0 + self.passedcnt = 0 + # vars for errors + self.errlist = list() + + # vars for Rule Generator + self.pctgmetrics = set() # Percentage rule + + # vars for debug + self.datafname = datafname + self.debug = debug + self.fullrulefname = fullrulefname + + def read_json(self, filename: str) -> dict: + try: + with open(Path(filename).resolve(), "r") as f: + data = json.loads(f.read()) + except OSError as e: + print(f"Error when reading file {e}") + sys.exit() + + return data + + def json_dump(self, data, output_file): + parent = Path(output_file).parent + if not parent.exists(): + parent.mkdir(parents=True) + + with open(output_file, "w+") as output_file: + json.dump(data, + output_file, + ensure_ascii=True, + indent=4) + + def get_results(self, idx:int = 0): + return self.results[idx] + + def get_bounds(self, lb, ub, error, alias={}, ridx:int = 0) -> list: + """ + Get bounds and tolerance from lb, ub, and error. + If missing lb, use 0.0; missing ub, use float('inf); missing error, use self.tolerance. + + @param lb: str/float, lower bound + @param ub: str/float, upper bound + @param error: float/str, error tolerance + @returns: lower bound, return inf if the lower bound is a metric value and is not collected + upper bound, return -1 if the upper bound is a metric value and is not collected + tolerance, denormalized base on upper bound value + """ + # init ubv and lbv to invalid values + def get_bound_value (bound, initval, ridx): + val = initval + if isinstance(bound, int) or isinstance(bound, float): + val = bound + elif isinstance(bound, str): + if bound == '': + val = float("inf") + elif bound in alias: + vall = self.get_value(alias[ub], ridx) + if vall: + val = vall[0] + elif bound.replace('.', '1').isdigit(): + val = float(bound) + else: + print("Wrong bound: {0}".format(bound)) + else: + print("Wrong bound: {0}".format(bound)) + return val + + ubv = get_bound_value(ub, -1, ridx) + lbv = get_bound_value(lb, float('inf'), ridx) + t = get_bound_value(error, self.tolerance, ridx) + + # denormalize error threshold + denormerr = t * ubv / 100 if ubv != 100 and ubv > 0 else t + + return lbv, ubv, denormerr + + def get_value(self, name:str, ridx:int = 0) -> list: + """ + Get value of the metric from self.results. + If result of this metric is not provided, the metric name will be added into self.ignoremetics and self.errlist. + All future test(s) on this metric will fail. + + @param name: name of the metric + @returns: list with value found in self.results; list is empty when not value found. + """ + results = [] + data = self.results[ridx] if ridx in self.results else self.results[0] + if name not in self.ignoremetrics: + if name in data: + results.append(data[name]) + elif name.replace('.', '1').isdigit(): + results.append(float(name)) + else: + self.errlist.append("Metric '%s' is not collected or the value format is incorrect"%(name)) + self.ignoremetrics.add(name) + return results + + def check_bound(self, val, lb, ub, err): + return True if val <= ub + err and val >= lb - err else False + + # Positive Value Sanity check + def pos_val_test(self): + """ + Check if metrics value are non-negative. + One metric is counted as one test. + Failure: when metric value is negative or not provided. + Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics. + """ + negmetric = set() + missmetric = set() + pcnt = 0 + tcnt = 0 + for name, val in self.get_results().items(): + if val is None or val == '': + missmetric.add(name) + self.errlist.append("Metric '%s' is not collected"%(name)) + elif val < 0: + negmetric.add("{0}(={1:.4f})".format(name, val)) + else: + pcnt += 1 + tcnt += 1 + + self.failtests['PositiveValueTest']['Total Tests'] = tcnt + self.failtests['PositiveValueTest']['Passed Tests'] = pcnt + if len(negmetric) or len(missmetric)> 0: + self.ignoremetrics.update(negmetric) + self.ignoremetrics.update(missmetric) + self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue':list(negmetric), 'MissingValue':list(missmetric)}) + + return + + def evaluate_formula(self, formula:str, alias:dict, ridx:int = 0): + """ + Evaluate the value of formula. + + @param formula: the formula to be evaluated + @param alias: the dict has alias to metric name mapping + @returns: value of the formula is success; -1 if the one or more metric value not provided + """ + stack = [] + b = 0 + errs = [] + sign = "+" + f = str() + + #TODO: support parenthesis? + for i in range(len(formula)): + if i+1 == len(formula) or formula[i] in ('+', '-', '*', '/'): + s = alias[formula[b:i]] if i+1 < len(formula) else alias[formula[b:]] + v = self.get_value(s, ridx) + if not v: + errs.append(s) + else: + f = f + "{0}(={1:.4f})".format(s, v[0]) + if sign == "*": + stack[-1] = stack[-1] * v + elif sign == "/": + stack[-1] = stack[-1] / v + elif sign == '-': + stack.append(-v[0]) + else: + stack.append(v[0]) + if i + 1 < len(formula): + sign = formula[i] + f += sign + b = i + 1 + + if len(errs) > 0: + return -1, "Metric value missing: "+','.join(errs) + + val = sum(stack) + return val, f + + # Relationships Tests + def relationship_test(self, rule: dict): + """ + Validate if the metrics follow the required relationship in the rule. + eg. lower_bound <= eval(formula)<= upper_bound + One rule is counted as ont test. + Failure: when one or more metric result(s) not provided, or when formula evaluated outside of upper/lower bounds. + + @param rule: dict with metric name(+alias), formula, and required upper and lower bounds. + """ + alias = dict() + for m in rule['Metrics']: + alias[m['Alias']] = m['Name'] + lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex']) + val, f = self.evaluate_formula(rule['Formula'], alias, ridx=rule['RuleIndex']) + if val == -1: + self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Description':f}) + elif not self.check_bound(val, lbv, ubv, t): + lb = rule['RangeLower'] + ub = rule['RangeUpper'] + if isinstance(lb, str): + if lb in alias: + lb = alias[lb] + if isinstance(ub, str): + if ub in alias: + ub = alias[ub] + self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Formula':f, + 'RangeLower': lb, 'LowerBoundValue': self.get_value(lb), + 'RangeUpper': ub, 'UpperBoundValue':self.get_value(ub), + 'ErrorThreshold': t, 'CollectedValue': val}) + else: + self.passedcnt += 1 + self.failtests['RelationshipTest']['Passed Tests'] += 1 + self.totalcnt += 1 + self.failtests['RelationshipTest']['Total Tests'] += 1 + + return + + + # Single Metric Test + def single_test(self, rule:dict): + """ + Validate if the metrics are in the required value range. + eg. lower_bound <= metrics_value <= upper_bound + One metric is counted as one test in this type of test. + One rule may include one or more metrics. + Failure: when the metric value not provided or the value is outside the bounds. + This test updates self.total_cnt and records failed tests in self.failtest['SingleMetricTest']. + + @param rule: dict with metrics to validate and the value range requirement + """ + lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold']) + metrics = rule['Metrics'] + passcnt = 0 + totalcnt = 0 + faillist = [] + for m in metrics: + totalcnt += 1 + result = self.get_value(m['Name']) + if len(result) > 0 and self.check_bound(result[0], lbv, ubv, t): + passcnt += 1 + else: + faillist.append({'MetricName':m['Name'], 'CollectedValue':result}) + + self.totalcnt += totalcnt + self.passedcnt += passcnt + self.failtests['SingleMetricTest']['Total Tests'] += totalcnt + self.failtests['SingleMetricTest']['Passed Tests'] += passcnt + if len(faillist) != 0: + self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'], + 'RangeLower': rule['RangeLower'], + 'RangeUpper': rule['RangeUpper'], + 'ErrorThreshold':rule['ErrorThreshold'], + 'Failure':faillist}) + + return + + def create_report(self): + """ + Create final report and write into a JSON file. + """ + alldata = list() + for i in range(0, len(self.workloads)): + reportstas = {"Total Rule Count": self.alltotalcnt[i], "Passed Rule Count": self.allpassedcnt[i]} + data = {"Metric Validation Statistics": reportstas, "Tests in Category": self.allfailtests[i], + "Errors":self.allerrlist[i]} + alldata.append({"Workload": self.workloads[i], "Report": data}) + + json_str = json.dumps(alldata, indent=4) + print("Test validation finished. Final report: ") + print(json_str) + + if self.debug: + allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]} for i in range(0, len(self.workloads))] + self.json_dump(allres, self.datafname) + + def check_rule(self, testtype, metric_list): + """ + Check if the rule uses metric(s) that not exist in current platform. + + @param metric_list: list of metrics from the rule. + @return: False when find one metric out in Metric file. (This rule should not skipped.) + True when all metrics used in the rule are found in Metric file. + """ + if testtype == "RelationshipTest": + for m in metric_list: + if m['Name'] not in self.metrics: + return False + return True + + # Start of Collector and Converter + def convert(self, data: list, idx: int): + """ + Convert collected metric data from the -j output to dict of {metric_name:value}. + """ + for json_string in data: + try: + result =json.loads(json_string) + if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "": + name = result["metric-unit"].split(" ")[1] if len(result["metric-unit"].split(" ")) > 1 \ + else result["metric-unit"] + if idx not in self.results: self.results[idx] = dict() + self.results[idx][name.lower()] = float(result["metric-value"]) + except ValueError as error: + continue + return + + def collect_perf(self, data_file: str, workload: str): + """ + Collect metric data with "perf stat -M" on given workload with -a and -j. + """ + self.results = dict() + tool = 'perf' + print(f"Starting perf collection") + print(f"Workload: {workload}") + collectlist = dict() + if self.collectlist != "": + collectlist[0] = {x for x in self.collectlist.split(",")} + else: + collectlist[0] = set(list(self.metrics)) + # Create metric set for relationship rules + for rule in self.rules: + if rule["TestType"] == "RelationshipTest": + metrics = [m["Name"] for m in rule["Metrics"]] + if not any(m not in collectlist[0] for m in metrics): + collectlist[rule["RuleIndex"]] = set(metrics) + + for idx, metrics in collectlist.items(): + if idx == 0: wl = "sleep 0.5".split() + else: wl = workload.split() + for metric in metrics: + command = [tool, 'stat', '-j', '-M', f"{metric}", "-a"] + command.extend(wl) + cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8') + data = [x+'}' for x in cmd.stderr.split('}\n') if x] + self.convert(data, idx) + # End of Collector and Converter + + # Start of Rule Generator + def parse_perf_metrics(self): + """ + Read and parse perf metric file: + 1) find metrics with '1%' or '100%' as ScaleUnit for Percent check + 2) create metric name list + """ + command = ['perf', 'list', '-j', '--details', 'metrics'] + cmd = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') + try: + data = json.loads(cmd.stdout) + for m in data: + if 'MetricName' not in m: + print("Warning: no metric name") + continue + name = m['MetricName'] + self.metrics.add(name) + if 'ScaleUnit' in m and (m['ScaleUnit'] == '1%' or m['ScaleUnit'] == '100%'): + self.pctgmetrics.add(name.lower()) + except ValueError as error: + print(f"Error when parsing metric data") + sys.exit() + + return + + def create_rules(self): + """ + Create full rules which includes: + 1) All the rules from the "relationshi_rules" file + 2) SingleMetric rule for all the 'percent' metrics + + Reindex all the rules to avoid repeated RuleIndex + """ + self.rules = self.read_json(self.rulefname)['RelationshipRules'] + pctgrule = {'RuleIndex':0, + 'TestType':'SingleMetricTest', + 'RangeLower':'0', + 'RangeUpper': '100', + 'ErrorThreshold': self.tolerance, + 'Description':'Metrics in percent unit have value with in [0, 100]', + 'Metrics': [{'Name': m} for m in self.pctgmetrics]} + self.rules.append(pctgrule) + + # Re-index all rules to avoid repeated RuleIndex + idx = 1 + for r in self.rules: + r['RuleIndex'] = idx + idx += 1 + + if self.debug: + #TODO: need to test and generate file name correctly + data = {'RelationshipRules':self.rules, 'SupportedMetrics': [{"MetricName": name} for name in self.metrics]} + self.json_dump(data, self.fullrulefname) + + return + # End of Rule Generator + + def _storewldata(self, key): + ''' + Store all the data of one workload into the corresponding data structure for all workloads. + @param key: key to the dictionaries (index of self.workloads). + ''' + self.allresults[key] = self.results + self.allignoremetrics[key] = self.ignoremetrics + self.allfailtests[key] = self.failtests + self.alltotalcnt[key] = self.totalcnt + self.allpassedcnt[key] = self.passedcnt + self.allerrlist[key] = self.errlist + + #Initialize data structures before data validation of each workload + def _init_data(self): + + testtypes = ['PositiveValueTest', 'RelationshipTest', 'SingleMetricTest'] + self.results = dict() + self.ignoremetrics= set() + self.errlist = list() + self.failtests = {k:{'Total Tests':0, 'Passed Tests':0, 'Failed Tests':[]} for k in testtypes} + self.totalcnt = 0 + self.passedcnt = 0 + + def test(self): + ''' + The real entry point of the test framework. + This function loads the validation rule JSON file and Standard Metric file to create rules for + testing and namemap dictionaries. + It also reads in result JSON file for testing. + + In the test process, it passes through each rule and launch correct test function bases on the + 'TestType' field of the rule. + + The final report is written into a JSON file. + ''' + self.parse_perf_metrics() + self.create_rules() + for i in range(0, len(self.workloads)): + self._init_data() + self.collect_perf(self.datafname, self.workloads[i]) + # Run positive value test + self.pos_val_test() + for r in self.rules: + # skip rules that uses metrics not exist in this platform + testtype = r['TestType'] + if not self.check_rule(testtype, r['Metrics']): + continue + if testtype == 'RelationshipTest': + self.relationship_test(r) + elif testtype == 'SingleMetricTest': + self.single_test(r) + else: + print("Unsupported Test Type: ", testtype) + self.errlist.append("Unsupported Test Type from rule: " + r['RuleIndex']) + self._storewldata(i) + print("Workload: ", self.workloads[i]) + print("Total metrics collected: ", self.failtests['PositiveValueTest']['Total Tests']) + print("Non-negative metric count: ", self.failtests['PositiveValueTest']['Passed Tests']) + print("Total Test Count: ", self.totalcnt) + print("Passed Test Count: ", self.passedcnt) + + self.create_report() + return sum(self.alltotalcnt.values()) != sum(self.allpassedcnt.values()) +# End of Class Validator + + +def main() -> None: + parser = argparse.ArgumentParser(description="Launch metric value validation") + + parser.add_argument("-rule", help="Base validation rule file", required=True) + parser.add_argument("-output_dir", help="Path for validator output file, report file", required=True) + parser.add_argument("-debug", help="Debug run, save intermediate data to files", action="store_true", default=False) + parser.add_argument("-wl", help="Workload to run while data collection", default="true") + parser.add_argument("-m", help="Metric list to validate", default="") + args = parser.parse_args() + outpath = Path(args.output_dir) + reportf = Path.joinpath(outpath, 'perf_report.json') + fullrule = Path.joinpath(outpath, 'full_rule.json') + datafile = Path.joinpath(outpath, 'perf_data.json') + + validator = Validator(args.rule, reportf, debug=args.debug, + datafname=datafile, fullrulefname=fullrule, workload=args.wl, + metrics=args.m) + ret = validator.test() + + return ret + + +if __name__ == "__main__": + import sys + sys.exit(main()) + + + diff --git a/tools/perf/tests/shell/lib/perf_metric_validation_rules.json b/tools/perf/tests/shell/lib/perf_metric_validation_rules.json new file mode 100644 index 000000000000..debaa910da9f --- /dev/null +++ b/tools/perf/tests/shell/lib/perf_metric_validation_rules.json @@ -0,0 +1,387 @@ +{ + "RelationshipRules": [ + { + "RuleIndex": 1, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "Intel(R) Optane(TM) Persistent Memory(PMEM) bandwidth total includes Intel(R) Optane(TM) Persistent Memory(PMEM) read bandwidth and Intel(R) Optane(TM) Persistent Memory(PMEM) write bandwidth", + "Metrics": [ + { + "Name": "pmem_memory_bandwidth_read", + "Alias": "a" + }, + { + "Name": "pmem_memory_bandwidth_write", + "Alias": "b" + }, + { + "Name": "pmem_memory_bandwidth_total", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 2, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "DDR memory bandwidth total includes DDR memory read bandwidth and DDR memory write bandwidth", + "Metrics": [ + { + "Name": "memory_bandwidth_read", + "Alias": "a" + }, + { + "Name": "memory_bandwidth_write", + "Alias": "b" + }, + { + "Name": "memory_bandwidth_total", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 3, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "100", + "RangeUpper": "100", + "ErrorThreshold": 5.0, + "Description": "Total memory read accesses includes memory reads from last level cache (LLC) addressed to local DRAM and memory reads from the last level cache (LLC) addressed to remote DRAM.", + "Metrics": [ + { + "Name": "numa_reads_addressed_to_local_dram", + "Alias": "a" + }, + { + "Name": "numa_reads_addressed_to_remote_dram", + "Alias": "b" + } + ] + }, + { + "RuleIndex": 4, + "Formula": "a", + "TestType": "SingleMetricTest", + "RangeLower": "0.125", + "RangeUpper": "", + "ErrorThreshold": "", + "Description": "", + "Metrics": [ + { + "Name": "cpi", + "Alias": "a" + } + ] + }, + { + "RuleIndex": 5, + "Formula": "", + "TestType": "SingleMetricTest", + "RangeLower": "0", + "RangeUpper": "1", + "ErrorThreshold": 5.0, + "Description": "Ratio values should be within value range [0,1)", + "Metrics": [ + { + "Name": "loads_per_instr", + "Alias": "" + }, + { + "Name": "stores_per_instr", + "Alias": "" + }, + { + "Name": "l1d_mpi", + "Alias": "" + }, + { + "Name": "l1d_demand_data_read_hits_per_instr", + "Alias": "" + }, + { + "Name": "l1_i_code_read_misses_with_prefetches_per_instr", + "Alias": "" + }, + { + "Name": "l2_demand_data_read_hits_per_instr", + "Alias": "" + }, + { + "Name": "l2_mpi", + "Alias": "" + }, + { + "Name": "l2_demand_data_read_mpi", + "Alias": "" + }, + { + "Name": "l2_demand_code_mpi", + "Alias": "" + } + ] + }, + { + "RuleIndex": 6, + "Formula": "a+b+c+d", + "TestType": "RelationshipTest", + "RangeLower": "100", + "RangeUpper": "100", + "ErrorThreshold": 5.0, + "Description": "Sum of TMA level 1 metrics should be 100%", + "Metrics": [ + { + "Name": "tma_frontend_bound", + "Alias": "a" + }, + { + "Name": "tma_bad_speculation", + "Alias": "b" + }, + { + "Name": "tma_backend_bound", + "Alias": "c" + }, + { + "Name": "tma_retiring", + "Alias": "d" + } + ] + }, + { + "RuleIndex": 7, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "Sum of the level 2 children should equal level 1 parent", + "Metrics": [ + { + "Name": "tma_fetch_latency", + "Alias": "a" + }, + { + "Name": "tma_fetch_bandwidth", + "Alias": "b" + }, + { + "Name": "tma_frontend_bound", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 8, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "Sum of the level 2 children should equal level 1 parent", + "Metrics": [ + { + "Name": "tma_branch_mispredicts", + "Alias": "a" + }, + { + "Name": "tma_machine_clears", + "Alias": "b" + }, + { + "Name": "tma_bad_speculation", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 9, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "Sum of the level 2 children should equal level 1 parent", + "Metrics": [ + { + "Name": "tma_memory_bound", + "Alias": "a" + }, + { + "Name": "tma_core_bound", + "Alias": "b" + }, + { + "Name": "tma_backend_bound", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 10, + "Formula": "a+b", + "TestType": "RelationshipTest", + "RangeLower": "c", + "RangeUpper": "c", + "ErrorThreshold": 5.0, + "Description": "Sum of the level 2 children should equal level 1 parent", + "Metrics": [ + { + "Name": "tma_light_operations", + "Alias": "a" + }, + { + "Name": "tma_heavy_operations", + "Alias": "b" + }, + { + "Name": "tma_retiring", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 11, + "Formula": "a+b+c", + "TestType": "RelationshipTest", + "RangeLower": "100", + "RangeUpper": "100", + "ErrorThreshold": 5.0, + "Description": "The all_requests includes the memory_page_empty, memory_page_misses, and memory_page_hits equals.", + "Metrics": [ + { + "Name": "memory_page_empty_vs_all_requests", + "Alias": "a" + }, + { + "Name": "memory_page_misses_vs_all_requests", + "Alias": "b" + }, + { + "Name": "memory_page_hits_vs_all_requests", + "Alias": "c" + } + ] + }, + { + "RuleIndex": 12, + "Formula": "a-b", + "TestType": "RelationshipTest", + "RangeLower": "0", + "RangeUpper": "", + "ErrorThreshold": 5.0, + "Description": "CPU utilization in kernel mode should always be <= cpu utilization", + "Metrics": [ + { + "Name": "cpu_utilization", + "Alias": "a" + }, + { + "Name": "cpu_utilization_in_kernel_mode", + "Alias": "b" + } + ] + }, + { + "RuleIndex": 13, + "Formula": "a-b", + "TestType": "RelationshipTest", + "RangeLower": "0", + "RangeUpper": "", + "ErrorThreshold": 5.0, + "Description": "Total L2 misses per instruction should be >= L2 demand data read misses per instruction", + "Metrics": [ + { + "Name": "l2_mpi", + "Alias": "a" + }, + { + "Name": "l2_demand_data_read_mpi", + "Alias": "b" + } + ] + }, + { + "RuleIndex": 14, + "Formula": "a-b", + "TestType": "RelationshipTest", + "RangeLower": "0", + "RangeUpper": "", + "ErrorThreshold": 5.0, + "Description": "Total L2 misses per instruction should be >= L2 demand code misses per instruction", + "Metrics": [ + { + "Name": "l2_mpi", + "Alias": "a" + }, + { + "Name": "l2_demand_code_mpi", + "Alias": "b" + } + ] + }, + { + "RuleIndex": 15, + "Formula": "b+c+d", + "TestType": "RelationshipTest", + "RangeLower": "a", + "RangeUpper": "a", + "ErrorThreshold": 5.0, + "Description": "L3 data read, rfo, code misses per instruction equals total L3 misses per instruction.", + "Metrics": [ + { + "Name": "llc_mpi", + "Alias": "a" + }, + { + "Name": "llc_data_read_mpi_demand_plus_prefetch", + "Alias": "b" + }, + { + "Name": "llc_rfo_read_mpi_demand_plus_prefetch", + "Alias": "c" + }, + { + "Name": "llc_code_read_mpi_demand_plus_prefetch", + "Alias": "d" + } + ] + }, + { + "RuleIndex": 16, + "Formula": "a", + "TestType": "SingleMetricTest", + "RangeLower": "0", + "RangeUpper": "8", + "ErrorThreshold": 0.0, + "Description": "Setting generous range for allowable frequencies", + "Metrics": [ + { + "Name": "uncore_freq", + "Alias": "a" + } + ] + }, + { + "RuleIndex": 17, + "Formula": "a", + "TestType": "SingleMetricTest", + "RangeLower": "0", + "RangeUpper": "8", + "ErrorThreshold": 0.0, + "Description": "Setting generous range for allowable frequencies", + "Metrics": [ + { + "Name": "cpu_operating_frequency", + "Alias": "a" + } + ] + } + ] +} \ No newline at end of file diff --git a/tools/perf/tests/shell/stat_metrics_values.sh b/tools/perf/tests/shell/stat_metrics_values.sh new file mode 100755 index 000000000000..ad94c936de7e --- /dev/null +++ b/tools/perf/tests/shell/stat_metrics_values.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# perf metrics value validation +# SPDX-License-Identifier: GPL-2.0 +if [ "x$PYTHON" == "x" ] +then + if which python3 > /dev/null + then + PYTHON=python3 + else + echo Skipping test, python3 not detected please set environment variable PYTHON. + exit 2 + fi +fi + +grep -q GenuineIntel /proc/cpuinfo || { echo Skipping non-Intel; exit 2; } + +pythonvalidator=$(dirname $0)/lib/perf_metric_validation.py +rulefile=$(dirname $0)/lib/perf_metric_validation_rules.json +tmpdir=$(mktemp -d /tmp/__perf_test.program.XXXXX) +workload="perf bench futex hash -r 2 -s" + +# Add -debug, save data file and full rule file +echo "Launch python validation script $pythonvalidator" +echo "Output will be stored in: $tmpdir" +$PYTHON $pythonvalidator -rule $rulefile -output_dir $tmpdir -wl "${workload}" +ret=$? +rm -rf $tmpdir + +exit $ret + -- cgit v1.2.3 From a0f1cc18f91faf75a321135ac08385a4f260a87d Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Tue, 20 Jun 2023 10:00:26 -0700 Subject: perf test: Add skip list for metrics known would fail Add skip list for metrics known would fail because some of the metrics are very likely to fail due to multiplexing or other errors. So add all of the flaky tests into the skip list. Signed-off-by: Weilin Wang Tested-by: Namhyung Kim Cc: ravi.bangoria@amd.com Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Caleb Biggers Cc: Perry Taylor Cc: Samantha Alt Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230620170027.1861012-3-weilin.wang@intel.com Signed-off-by: Namhyung Kim --- .../perf/tests/shell/lib/perf_metric_validation.py | 31 +++++++++++++++++++--- .../shell/lib/perf_metric_validation_rules.json | 11 ++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py index 81bd2bf38b67..3c3a9b4f8b82 100644 --- a/tools/perf/tests/shell/lib/perf_metric_validation.py +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py @@ -12,7 +12,7 @@ class Validator: self.reportfname = reportfname self.rules = None self.collectlist=metrics - self.metrics = set() + self.metrics = set(metrics) self.tolerance = t self.workloads = [x for x in workload.split(",") if x] @@ -148,6 +148,7 @@ class Validator: self.errlist.append("Metric '%s' is not collected"%(name)) elif val < 0: negmetric.add("{0}(={1:.4f})".format(name, val)) + self.collectlist[0].append(name) else: pcnt += 1 tcnt += 1 @@ -266,6 +267,7 @@ class Validator: passcnt += 1 else: faillist.append({'MetricName':m['Name'], 'CollectedValue':result}) + self.collectlist[0].append(m['Name']) self.totalcnt += totalcnt self.passedcnt += passcnt @@ -348,7 +350,7 @@ class Validator: if rule["TestType"] == "RelationshipTest": metrics = [m["Name"] for m in rule["Metrics"]] if not any(m not in collectlist[0] for m in metrics): - collectlist[rule["RuleIndex"]] = set(metrics) + collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))] for idx, metrics in collectlist.items(): if idx == 0: wl = "sleep 0.5".split() @@ -356,9 +358,12 @@ class Validator: for metric in metrics: command = [tool, 'stat', '-j', '-M', f"{metric}", "-a"] command.extend(wl) + print(" ".join(command)) cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8') data = [x+'}' for x in cmd.stderr.split('}\n') if x] self.convert(data, idx) + self.collectlist = dict() + self.collectlist[0] = list() # End of Collector and Converter # Start of Rule Generator @@ -386,6 +391,20 @@ class Validator: return + def remove_unsupported_rules(self, rules, skiplist: set = None): + for m in skiplist: + self.metrics.discard(m) + new_rules = [] + for rule in rules: + add_rule = True + for m in rule["Metrics"]: + if m["Name"] not in self.metrics: + add_rule = False + break + if add_rule: + new_rules.append(rule) + return new_rules + def create_rules(self): """ Create full rules which includes: @@ -394,7 +413,10 @@ class Validator: Reindex all the rules to avoid repeated RuleIndex """ - self.rules = self.read_json(self.rulefname)['RelationshipRules'] + data = self.read_json(self.rulefname) + rules = data['RelationshipRules'] + skiplist = set(data['SkipList']) + self.rules = self.remove_unsupported_rules(rules, skiplist) pctgrule = {'RuleIndex':0, 'TestType':'SingleMetricTest', 'RangeLower':'0', @@ -453,7 +475,8 @@ class Validator: The final report is written into a JSON file. ''' - self.parse_perf_metrics() + if not self.collectlist: + self.parse_perf_metrics() self.create_rules() for i in range(0, len(self.workloads)): self._init_data() diff --git a/tools/perf/tests/shell/lib/perf_metric_validation_rules.json b/tools/perf/tests/shell/lib/perf_metric_validation_rules.json index debaa910da9f..eb6f59e018b7 100644 --- a/tools/perf/tests/shell/lib/perf_metric_validation_rules.json +++ b/tools/perf/tests/shell/lib/perf_metric_validation_rules.json @@ -1,4 +1,15 @@ { + "SkipList": [ + "tsx_aborted_cycles", + "tsx_transactional_cycles", + "C2_Pkg_Residency", + "C6_Pkg_Residency", + "C1_Core_Residency", + "C6_Core_Residency", + "tma_false_sharing", + "tma_remote_cache", + "tma_contested_accesses" + ], "RelationshipRules": [ { "RuleIndex": 1, -- cgit v1.2.3 From 1203a63da0461d0081ea6e3d5e52893985bfed42 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Tue, 20 Jun 2023 10:00:27 -0700 Subject: perf test: Rerun failed metrics with longer workload Rerun failed metrics with longer workload to avoid false failure because sometimes metric value test fails when running in very short amount of time. Skip rerun if equal to or more than 20 metrics fail. Signed-off-by: Weilin Wang Tested-by: Namhyung Kim Cc: ravi.bangoria@amd.com Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Caleb Biggers Cc: Perry Taylor Cc: Samantha Alt Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230620170027.1861012-4-weilin.wang@intel.com Signed-off-by: Namhyung Kim --- .../perf/tests/shell/lib/perf_metric_validation.py | 129 +++++++++++++-------- 1 file changed, 83 insertions(+), 46 deletions(-) diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py index 3c3a9b4f8b82..50a34a9cc040 100644 --- a/tools/perf/tests/shell/lib/perf_metric_validation.py +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py @@ -11,8 +11,9 @@ class Validator: self.rulefname = rulefname self.reportfname = reportfname self.rules = None - self.collectlist=metrics - self.metrics = set(metrics) + self.collectlist:str = metrics + self.metrics = self.__set_metrics(metrics) + self.skiplist = set() self.tolerance = t self.workloads = [x for x in workload.split(",") if x] @@ -41,6 +42,12 @@ class Validator: self.debug = debug self.fullrulefname = fullrulefname + def __set_metrics(self, metrics=''): + if metrics != '': + return set(metrics.split(",")) + else: + return set() + def read_json(self, filename: str) -> dict: try: with open(Path(filename).resolve(), "r") as f: @@ -113,7 +120,7 @@ class Validator: All future test(s) on this metric will fail. @param name: name of the metric - @returns: list with value found in self.results; list is empty when not value found. + @returns: list with value found in self.results; list is empty when value is not found. """ results = [] data = self.results[ridx] if ridx in self.results else self.results[0] @@ -123,7 +130,6 @@ class Validator: elif name.replace('.', '1').isdigit(): results.append(float(name)) else: - self.errlist.append("Metric '%s' is not collected or the value format is incorrect"%(name)) self.ignoremetrics.add(name) return results @@ -138,27 +144,32 @@ class Validator: Failure: when metric value is negative or not provided. Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics. """ - negmetric = set() - missmetric = set() + negmetric = dict() pcnt = 0 tcnt = 0 + rerun = list() for name, val in self.get_results().items(): - if val is None or val == '': - missmetric.add(name) - self.errlist.append("Metric '%s' is not collected"%(name)) - elif val < 0: - negmetric.add("{0}(={1:.4f})".format(name, val)) - self.collectlist[0].append(name) + if val < 0: + negmetric[name] = val + rerun.append(name) else: pcnt += 1 tcnt += 1 + if len(rerun) > 0 and len(rerun) < 20: + second_results = dict() + self.second_test(rerun, second_results) + for name, val in second_results.items(): + if name not in negmetric: continue + if val >= 0: + del negmetric[name] + pcnt += 1 self.failtests['PositiveValueTest']['Total Tests'] = tcnt self.failtests['PositiveValueTest']['Passed Tests'] = pcnt - if len(negmetric) or len(missmetric)> 0: - self.ignoremetrics.update(negmetric) - self.ignoremetrics.update(missmetric) - self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue':list(negmetric), 'MissingValue':list(missmetric)}) + if len(negmetric.keys()): + self.ignoremetrics.update(negmetric.keys()) + negmessage = ["{0}(={1:.4f})".format(name, val) for name, val in negmetric.items()] + self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue': negmessage}) return @@ -259,21 +270,36 @@ class Validator: metrics = rule['Metrics'] passcnt = 0 totalcnt = 0 - faillist = [] + faillist = list() + failures = dict() + rerun = list() for m in metrics: totalcnt += 1 result = self.get_value(m['Name']) - if len(result) > 0 and self.check_bound(result[0], lbv, ubv, t): + if len(result) > 0 and self.check_bound(result[0], lbv, ubv, t) or m['Name'] in self.skiplist: passcnt += 1 else: - faillist.append({'MetricName':m['Name'], 'CollectedValue':result}) - self.collectlist[0].append(m['Name']) + failures[m['Name']] = result + rerun.append(m['Name']) + + if len(rerun) > 0 and len(rerun) < 20: + second_results = dict() + self.second_test(rerun, second_results) + for name, val in second_results.items(): + if name not in failures: continue + if self.check_bound(val, lbv, ubv, t): + passcnt += 1 + del failures[name] + else: + failures[name] = val + self.results[0][name] = val self.totalcnt += totalcnt self.passedcnt += passcnt self.failtests['SingleMetricTest']['Total Tests'] += totalcnt self.failtests['SingleMetricTest']['Passed Tests'] += passcnt - if len(faillist) != 0: + if len(failures.keys()) != 0: + faillist = [{'MetricName':name, 'CollectedValue':val} for name, val in failures.items()] self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'], 'RangeLower': rule['RangeLower'], 'RangeUpper': rule['RangeUpper'], @@ -316,7 +342,7 @@ class Validator: return True # Start of Collector and Converter - def convert(self, data: list, idx: int): + def convert(self, data: list, metricvalues:dict): """ Convert collected metric data from the -j output to dict of {metric_name:value}. """ @@ -326,20 +352,29 @@ class Validator: if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "": name = result["metric-unit"].split(" ")[1] if len(result["metric-unit"].split(" ")) > 1 \ else result["metric-unit"] - if idx not in self.results: self.results[idx] = dict() - self.results[idx][name.lower()] = float(result["metric-value"]) + metricvalues[name.lower()] = float(result["metric-value"]) except ValueError as error: continue return - def collect_perf(self, data_file: str, workload: str): + def _run_perf(self, metric, workload: str): + tool = 'perf' + command = [tool, 'stat', '-j', '-M', f"{metric}", "-a"] + wl = workload.split() + command.extend(wl) + print(" ".join(command)) + cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8') + data = [x+'}' for x in cmd.stderr.split('}\n') if x] + return data + + + def collect_perf(self, workload: str): """ Collect metric data with "perf stat -M" on given workload with -a and -j. """ self.results = dict() - tool = 'perf' print(f"Starting perf collection") - print(f"Workload: {workload}") + print(f"Long workload: {workload}") collectlist = dict() if self.collectlist != "": collectlist[0] = {x for x in self.collectlist.split(",")} @@ -353,17 +388,20 @@ class Validator: collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))] for idx, metrics in collectlist.items(): - if idx == 0: wl = "sleep 0.5".split() - else: wl = workload.split() + if idx == 0: wl = "true" + else: wl = workload for metric in metrics: - command = [tool, 'stat', '-j', '-M', f"{metric}", "-a"] - command.extend(wl) - print(" ".join(command)) - cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8') - data = [x+'}' for x in cmd.stderr.split('}\n') if x] - self.convert(data, idx) - self.collectlist = dict() - self.collectlist[0] = list() + data = self._run_perf(metric, wl) + if idx not in self.results: self.results[idx] = dict() + self.convert(data, self.results[idx]) + return + + def second_test(self, collectlist, second_results): + workload = self.workloads[self.wlidx] + for metric in collectlist: + data = self._run_perf(metric, workload) + self.convert(data, second_results) + # End of Collector and Converter # Start of Rule Generator @@ -381,7 +419,7 @@ class Validator: if 'MetricName' not in m: print("Warning: no metric name") continue - name = m['MetricName'] + name = m['MetricName'].lower() self.metrics.add(name) if 'ScaleUnit' in m and (m['ScaleUnit'] == '1%' or m['ScaleUnit'] == '100%'): self.pctgmetrics.add(name.lower()) @@ -391,14 +429,12 @@ class Validator: return - def remove_unsupported_rules(self, rules, skiplist: set = None): - for m in skiplist: - self.metrics.discard(m) + def remove_unsupported_rules(self, rules): new_rules = [] for rule in rules: add_rule = True for m in rule["Metrics"]: - if m["Name"] not in self.metrics: + if m["Name"] in self.skiplist or m["Name"] not in self.metrics: add_rule = False break if add_rule: @@ -415,15 +451,15 @@ class Validator: """ data = self.read_json(self.rulefname) rules = data['RelationshipRules'] - skiplist = set(data['SkipList']) - self.rules = self.remove_unsupported_rules(rules, skiplist) + self.skiplist = set([name.lower() for name in data['SkipList']]) + self.rules = self.remove_unsupported_rules(rules) pctgrule = {'RuleIndex':0, 'TestType':'SingleMetricTest', 'RangeLower':'0', 'RangeUpper': '100', 'ErrorThreshold': self.tolerance, 'Description':'Metrics in percent unit have value with in [0, 100]', - 'Metrics': [{'Name': m} for m in self.pctgmetrics]} + 'Metrics': [{'Name': m.lower()} for m in self.pctgmetrics]} self.rules.append(pctgrule) # Re-index all rules to avoid repeated RuleIndex @@ -479,8 +515,9 @@ class Validator: self.parse_perf_metrics() self.create_rules() for i in range(0, len(self.workloads)): + self.wlidx = i self._init_data() - self.collect_perf(self.datafname, self.workloads[i]) + self.collect_perf(self.workloads[i]) # Run positive value test self.pos_val_test() for r in self.rules: -- cgit v1.2.3 From d7c2d34d72bfeffca4983c4dcba55d1dd31012be Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Jun 2023 22:58:32 -0700 Subject: perf test: Remove x permission from lib/stat_output.sh The commit fc51fc87b1b8 factored out the helper functions to a library but the new file had execute permission. Due to the way it detects the shell test scripts, it showed up in the perf test list unexpectedly. $ ./perf test list 2>&1 | grep 86 76: x86 bp modify 77: x86 Sample parsing 78: x86 hybrid 86: <---- (here) $ ./perf test -v 86 86: : --- start --- test child forked, pid 1932207 test child finished with 0 ---- end ---- : Ok As it's a collection of library functions, it should not run as is. Let's remove the execute permission. Fixes: fc51fc87b1b8 ("perf test: Move all the check functions of stat CSV output to lib") Acked-by: Ian Rogers Cc: Kan Liang Cc: Andi Kleen Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230622055832.83476-1-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/lib/stat_output.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tools/perf/tests/shell/lib/stat_output.sh diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh old mode 100755 new mode 100644 -- cgit v1.2.3 From 765be32b97fe69f67164cc7772a74c6a10562e0b Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 23 May 2023 17:57:53 +0800 Subject: perf symbol: Add LoongArch case in get_plt_sizes() We can see the following definitions in bfd/elfnn-loongarch.c: #define PLT_HEADER_INSNS 8 #define PLT_HEADER_SIZE (PLT_HEADER_INSNS * 4) #define PLT_ENTRY_INSNS 4 #define PLT_ENTRY_SIZE (PLT_ENTRY_INSNS * 4) so plt header size is 32 and plt entry size is 16 on LoongArch, let us add LoongArch case in get_plt_sizes(). Signed-off-by: Tiezhu Yang Acked-by: Huacai Chen Reviewed-by: Leo Yan Cc: Mark Rutland Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: loongarch@lists.linux.dev Cc: loongson-kernel@lists.loongnix.cn Cc: Ingo Molnar Link: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=bfd/elfnn-loongarch.c Link: https://lore.kernel.org/r/1684835873-15956-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Namhyung Kim --- tools/perf/util/symbol-elf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index e6493d1cc251..8bd466d1c2bd 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -42,6 +42,10 @@ #define EM_AARCH64 183 /* ARM 64 bit */ #endif +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif + #ifndef ELF32_ST_VISIBILITY #define ELF32_ST_VISIBILITY(o) ((o) & 0x03) #endif @@ -438,6 +442,10 @@ static bool get_plt_sizes(struct dso *dso, GElf_Ehdr *ehdr, GElf_Shdr *shdr_plt, *plt_header_size = 32; *plt_entry_size = 16; return true; + case EM_LOONGARCH: + *plt_header_size = 32; + *plt_entry_size = 16; + return true; case EM_SPARC: *plt_header_size = 48; *plt_entry_size = 12; -- cgit v1.2.3 From 33fe7c08446af6dda0ff08ff4fa9c921e574477f Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 22 Jun 2023 11:18:09 +0100 Subject: perf tests: Fix test_arm_callgraph_fp variable expansion $TEST_PROGRAM is a command with spaces so it's supposed to be word split. The referenced fix to fix the shellcheck warnings incorrectly quoted this string so unquote it to fix the test. At the same time silence the shellcheck warning for that line and fix two more shellcheck errors at the end of the script. Fixes: 1bb17b4c6c91 ("perf tests arm_callgraph_fp: Address shellcheck warnings about signal names and adding double quotes for expression") Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Mark Rutland Cc: Ian Rogers Cc: spoorts2@in.ibm.com Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kajol Jain Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230622101809.2431897-1-james.clark@arm.com Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/test_arm_callgraph_fp.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh index 1380e0d12dce..66dfdfdad553 100755 --- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh +++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh @@ -15,7 +15,8 @@ cleanup_files() trap cleanup_files EXIT TERM INT # Add a 1 second delay to skip samples that are not in the leaf() function -perf record -o "$PERF_DATA" --call-graph fp -e cycles//u -D 1000 --user-callchains -- "$TEST_PROGRAM" 2> /dev/null & +# shellcheck disable=SC2086 +perf record -o "$PERF_DATA" --call-graph fp -e cycles//u -D 1000 --user-callchains -- $TEST_PROGRAM 2> /dev/null & PID=$! echo " + Recording (PID=$PID)..." @@ -33,8 +34,8 @@ wait $PID # 76c leafloop # ... -perf script -i $PERF_DATA -F comm,ip,sym | head -n4 -perf script -i $PERF_DATA -F comm,ip,sym | head -n4 | \ +perf script -i "$PERF_DATA" -F comm,ip,sym | head -n4 +perf script -i "$PERF_DATA" -F comm,ip,sym | head -n4 | \ awk '{ if ($2 != "") sym[i++] = $2 } END { if (sym[0] != "leaf" || sym[1] != "parent" || sym[2] != "leafloop") exit 1 }' -- cgit v1.2.3 From 2d7f5540b8696b855adf4121ce4a9bf77938848f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 22 Jun 2023 16:53:56 -0700 Subject: perf script: Initialize buffer for regs_map() The buffer is used to save register mapping in a sample. Normally perf samples don't have any register so the string should be empty. But it missed to initialize the buffer when the size is 0. And it's passed to PyUnicode_FromString() with a garbage data. So it returns NULL due to invalid input (instead of an empty unicode string object) which causes a segfault like below: Thread 2.1 "perf" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x7ffff7c83780 (LWP 193775)] 0x00007ffff6dbca2e in PyDict_SetItem () from /lib/x86_64-linux-gnu/libpython3.11.so.1.0 (gdb) bt #0 0x00007ffff6dbca2e in PyDict_SetItem () from /lib/x86_64-linux-gnu/libpython3.11.so.1.0 #1 0x00007ffff6dbf848 in PyDict_SetItemString () from /lib/x86_64-linux-gnu/libpython3.11.so.1.0 #2 0x000055555575824d in pydict_set_item_string_decref (val=0x0, key=0x5555557f96e3 "iregs", dict=0x7ffff5f7f780) at util/scripting-engines/trace-event-python.c:145 #3 set_regs_in_dict (evsel=0x555555efc370, sample=0x7fffffffb870, dict=0x7ffff5f7f780) at util/scripting-engines/trace-event-python.c:776 #4 get_perf_sample_dict (sample=sample@entry=0x7fffffffb870, evsel=evsel@entry=0x555555efc370, al=al@entry=0x7fffffffb2e0, addr_al=addr_al@entry=0x0, callchain=callchain@entry=0x7ffff63ef440) at util/scripting-engines/trace-event-python.c:923 #5 0x0000555555758ec1 in python_process_tracepoint (sample=0x7fffffffb870, evsel=0x555555efc370, al=0x7fffffffb2e0, addr_al=0x0) at util/scripting-engines/trace-event-python.c:1044 #6 0x00005555555c5db8 in process_sample_event (tool=, event=, sample=, evsel=0x555555efc370, machine=0x555555ef4d68) at builtin-script.c:2421 #7 0x00005555556b7793 in perf_session__deliver_event (session=0x555555ef4b60, event=0x7ffff62ff7d0, tool=0x7fffffffc150, file_offset=30672, file_path=0x555555efb8a0 "perf.data") at util/session.c:1639 #8 0x00005555556bc864 in do_flush (show_progress=true, oe=0x555555efb700) at util/ordered-events.c:245 #9 __ordered_events__flush (oe=oe@entry=0x555555efb700, how=how@entry=OE_FLUSH__FINAL, timestamp=timestamp@entry=0) at util/ordered-events.c:324 #10 0x00005555556bd06e in ordered_events__flush (oe=oe@entry=0x555555efb700, how=how@entry=OE_FLUSH__FINAL) at util/ordered-events.c:342 #11 0x00005555556b9d63 in __perf_session__process_events (session=0x555555ef4b60) at util/session.c:2465 #12 perf_session__process_events (session=0x555555ef4b60) at util/session.c:2627 #13 0x00005555555cb1d0 in __cmd_script (script=0x7fffffffc150) at builtin-script.c:2839 #14 cmd_script (argc=, argv=) at builtin-script.c:4365 #15 0x0000555555650811 in run_builtin (p=p@entry=0x555555ed8948 , argc=argc@entry=4, argv=argv@entry=0x7fffffffe240) at perf.c:323 #16 0x0000555555597eb3 in handle_internal_command (argv=0x7fffffffe240, argc=4) at perf.c:377 #17 run_argv (argv=, argcp=) at perf.c:421 #18 main (argc=4, argv=0x7fffffffe240) at perf.c:537 Fixes: 51cfe7a3e87e ("perf python: Avoid 2 leak sanitizer issues") Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Acked-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/util/scripting-engines/trace-event-python.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 25fcd6630a4d..94312741443a 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -737,11 +737,11 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch unsigned int i = 0, r; int printed = 0; + bf[0] = 0; + if (size <= 0) return; - bf[0] = 0; - if (!regs || !regs->regs) return; -- cgit v1.2.3 From e4ef3ef1bc0a3d2535427da78b8095ef657eb474 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 22 Jun 2023 16:53:57 -0700 Subject: perf test: Set PERF_EXEC_PATH for script execution The task-analyzer.py script (actually every other scripts too) requires PERF_EXEC_PATH env to find dependent libraries and scripts. For scripts test to run correctly, it needs to set PERF_EXEC_PATH to the perf tool source directory. Instead of blindly update the env, let's check the directory structure to make sure it points to the correct location. Fixes: e8478b84d6ba ("perf test: add new task-analyzer tests") Cc: Petar Gligoric Cc: Hagen Paul Pfeifer Cc: Aditya Gupta Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Acked-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/test_task_analyzer.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index 59785dfc11f8..0095abbe20ca 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -5,6 +5,12 @@ tmpdir=$(mktemp -d /tmp/perf-script-task-analyzer-XXXXX) err=0 +# set PERF_EXEC_PATH to find scripts in the source directory +perfdir=$(dirname "$0")/../.. +if [ -e "$perfdir/scripts/python/Perf-Trace-Util" ]; then + export PERF_EXEC_PATH=$perfdir +fi + cleanup() { rm -f perf.data rm -f perf.data.old -- cgit v1.2.3 From 33941dbd14da4eac40a26ac5fd5f84e1842ffc3a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jun 2023 21:31:07 -0700 Subject: perf unwind: Fix map reference counts The result of thread__find_map is the map in the passed in addr_location. Calling addr_location__exit puts that map and so copies need to do a map__get. Add in the corresponding map__puts. v2. Add missing map__put when dso is missing. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Mark Rutland Cc: Ivan Babrou Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230623043107.4077510-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/unwind-libunwind-local.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 36bf5100bad2..ebfde537b99b 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -419,7 +419,8 @@ static struct map *find_map(unw_word_t ip, struct unwind_info *ui) struct map *ret; addr_location__init(&al); - ret = thread__find_map(ui->thread, PERF_RECORD_MISC_USER, ip, &al); + thread__find_map(ui->thread, PERF_RECORD_MISC_USER, ip, &al); + ret = map__get(al.map); addr_location__exit(&al); return ret; } @@ -440,8 +441,10 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, return -EINVAL; dso = map__dso(map); - if (!dso) + if (!dso) { + map__put(map); return -EINVAL; + } pr_debug("unwind: find_proc_info dso %s\n", dso->name); @@ -476,11 +479,11 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, memset(&di, 0, sizeof(di)); if (dwarf_find_debug_frame(0, &di, ip, base, symfile, start, map__end(map))) - return dwarf_search_unwind_table(as, ip, &di, pi, - need_unwind_info, arg); + ret = dwarf_search_unwind_table(as, ip, &di, pi, + need_unwind_info, arg); } #endif - + map__put(map); return ret; } @@ -534,12 +537,14 @@ static int access_dso_mem(struct unwind_info *ui, unw_word_t addr, dso = map__dso(map); - if (!dso) + if (!dso) { + map__put(map); return -1; + } size = dso__data_read_addr(dso, map, ui->machine, addr, (u8 *) data, sizeof(*data)); - + map__put(map); return !(size == sizeof(*data)); } -- cgit v1.2.3 From d685819b40affd39d2fbc937e93b2eee7fc63dd5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jun 2023 21:38:42 -0700 Subject: perf pmus: Add notion of default PMU for JSON events JSON events created in pmu-events.c by jevents.py may not specify a PMU they are associated with, in which case it is implied that it is the first core PMU. Care is needed to select this for regular 'cpu', s390 'cpum_cf' and ARMs many names as at the point the name is first needed the core PMUs list hasn't been initialized. Add a helper in perf_pmus to create this value, in the worst case by scanning sysfs. v2. Add missing close if fdopendir fails. Signed-off-by: Ian Rogers Tested-by: Thomas Richter Cc: Ravi Bangoria Cc: James Clark Cc: Mark Rutland Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Thomas Richter Cc: Alexander Shishkin Cc: Kan Liang Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230623043843.4080180-1-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/pmu.c | 35 ++++++++++++++++------------------- tools/perf/util/pmus.c | 37 ++++++++++++++++++++++++++++++++++++- tools/perf/util/pmus.h | 1 + 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 6142e4710a2f..963c12f910c5 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -741,9 +741,11 @@ out: } struct pmu_add_cpu_aliases_map_data { + /* List being added to. */ struct list_head *head; - const char *name; - const char *cpu_name; + /* If a pmu_event lacks a given PMU the default used. */ + char *default_pmu_name; + /* The PMU that we're searching for events for. */ struct perf_pmu *pmu; }; @@ -752,37 +754,32 @@ static int pmu_add_cpu_aliases_map_callback(const struct pmu_event *pe, void *vdata) { struct pmu_add_cpu_aliases_map_data *data = vdata; - const char *pname = pe->pmu ? pe->pmu : data->cpu_name; + const char *pname = pe->pmu ?: data->default_pmu_name; - if (data->pmu->is_uncore && pmu_uncore_alias_match(pname, data->name)) - goto new_alias; - - if (strcmp(pname, data->name)) - return 0; - -new_alias: - /* need type casts to override 'const' */ - __perf_pmu__new_alias(data->head, -1, (char *)pe->name, (char *)pe->desc, - (char *)pe->event, pe); + if (!strcmp(pname, data->pmu->name) || + (data->pmu->is_uncore && pmu_uncore_alias_match(pname, data->pmu->name))) { + /* need type casts to override 'const' */ + __perf_pmu__new_alias(data->head, -1, (char *)pe->name, (char *)pe->desc, + (char *)pe->event, pe); + } return 0; } /* - * From the pmu_events_map, find the table of PMU events that corresponds - * to the current running CPU. Then, add all PMU events from that table - * as aliases. + * From the pmu_events_table, find the events that correspond to the given + * PMU and add them to the list 'head'. */ void pmu_add_cpu_aliases_table(struct list_head *head, struct perf_pmu *pmu, - const struct pmu_events_table *table) + const struct pmu_events_table *table) { struct pmu_add_cpu_aliases_map_data data = { .head = head, - .name = pmu->name, - .cpu_name = is_sysfs_pmu_core(pmu->name) ? pmu->name : "cpu", + .default_pmu_name = perf_pmus__default_pmu_name(), .pmu = pmu, }; pmu_events_table_for_each_event(table, pmu_add_cpu_aliases_map_callback, &data); + free(data.default_pmu_name); } static void pmu_add_cpu_aliases(struct list_head *head, struct perf_pmu *pmu) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index d891d72c824e..0866dee3fc62 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -137,8 +137,10 @@ static void pmu_read_sysfs(bool core_only) return; dir = fdopendir(fd); - if (!dir) + if (!dir) { + close(fd); return; + } while ((dent = readdir(dir))) { if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) @@ -524,6 +526,39 @@ bool perf_pmus__supports_extended_type(void) return perf_pmus__do_support_extended_type; } +char *perf_pmus__default_pmu_name(void) +{ + int fd; + DIR *dir; + struct dirent *dent; + char *result = NULL; + + if (!list_empty(&core_pmus)) + return strdup(list_first_entry(&core_pmus, struct perf_pmu, list)->name); + + fd = perf_pmu__event_source_devices_fd(); + if (fd < 0) + return strdup("cpu"); + + dir = fdopendir(fd); + if (!dir) { + close(fd); + return strdup("cpu"); + } + + while ((dent = readdir(dir))) { + if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) + continue; + if (is_pmu_core(dent->d_name)) { + result = strdup(dent->d_name); + break; + } + } + + closedir(dir); + return result ?: strdup("cpu"); +} + struct perf_pmu *evsel__find_pmu(const struct evsel *evsel) { struct perf_pmu *pmu = evsel->pmu; diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h index d02ffea5d3a4..a21464432d0f 100644 --- a/tools/perf/util/pmus.h +++ b/tools/perf/util/pmus.h @@ -20,5 +20,6 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p bool perf_pmus__have_event(const char *pname, const char *name); int perf_pmus__num_core_pmus(void); bool perf_pmus__supports_extended_type(void); +char *perf_pmus__default_pmu_name(void); #endif /* __PMUS_H */ -- cgit v1.2.3 From d06593aa00b2bb1cc1ac9d88157bb8db0ac17872 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 22 Jun 2023 21:38:43 -0700 Subject: perf pmu: Remove a hard coded cpu PMU assumption The property of "cpu" when it has no cpu map is true on S390 with the PMU cpum_cf. Rather than maintain a list of such PMUs, reuse the is_core test result from the caller. Signed-off-by: Ian Rogers Tested-by: Thomas Richter Cc: Ravi Bangoria Cc: James Clark Cc: Mark Rutland Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Kan Liang Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20230623043843.4080180-2-irogers@google.com Signed-off-by: Namhyung Kim --- tools/perf/util/pmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 963c12f910c5..64fa568a5426 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -551,7 +551,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias, * Uncore PMUs have a "cpumask" file under sysfs. CPU PMUs (e.g. on arm/arm64) * may have a "cpus" file. */ -static struct perf_cpu_map *pmu_cpumask(int dirfd, const char *name) +static struct perf_cpu_map *pmu_cpumask(int dirfd, const char *name, bool is_core) { struct perf_cpu_map *cpus; const char *templates[] = { @@ -575,7 +575,8 @@ static struct perf_cpu_map *pmu_cpumask(int dirfd, const char *name) return cpus; } - return !strcmp(name, "cpu") ? perf_cpu_map__get(cpu_map__online()) : NULL; + /* Nothing found, for core PMUs assume this means all CPUs. */ + return is_core ? perf_cpu_map__get(cpu_map__online()) : NULL; } static bool pmu_is_uncore(int dirfd, const char *name) @@ -886,7 +887,8 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char if (!pmu) return NULL; - pmu->cpus = pmu_cpumask(dirfd, name); + pmu->is_core = is_pmu_core(name); + pmu->cpus = pmu_cpumask(dirfd, name, pmu->is_core); pmu->name = strdup(name); if (!pmu->name) goto err; @@ -903,7 +905,6 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char } pmu->type = type; - pmu->is_core = is_pmu_core(name); pmu->is_uncore = pmu_is_uncore(dirfd, name); if (pmu->is_uncore) pmu->id = pmu_id(name); -- cgit v1.2.3 From 8d3df7c39b10d4ff24a605f7b80bb6fefb990798 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 23 Jun 2023 16:01:38 -0700 Subject: perf test: Reorder event name checks in stat STD output linter On AMD machines, the perf stat STD output test failed like below: $ sudo ./perf test -v 98 98: perf stat STD output linter : --- start --- test child forked, pid 1841901 Checking STD output: no argswrong event metric. expected 'GHz' in 108,121 stalled-cycles-frontend # 10.88% frontend cycles idle test child finished with -1 ---- end ---- perf stat STD output linter: FAILED! This is because there are stalled-cycles-{frontend,backend} events are used by default. The current logic checks the event_name array to find which event it's running. But 'cycles' event comes before those stalled cycles event and it matches first. So it tries to find 'GHz' metric in the output (which is for the 'cycles') and fails. Move the stalled-cycles-{frontend,backend} events before 'cycles' so that it can find the stalled cycles events first. Also add a space after 'no args' test name for consistency. Fixes: 99a04a48f225 ("perf test: Add test case for the standard 'perf stat' output") Acked-by: Ian Rogers Cc: Kan Liang Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230623230139.985594-1-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/lib/stat_output.sh | 2 +- tools/perf/tests/shell/stat+std_output.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh index 363979b1123d..698343f0ecf9 100644 --- a/tools/perf/tests/shell/lib/stat_output.sh +++ b/tools/perf/tests/shell/lib/stat_output.sh @@ -9,7 +9,7 @@ function ParanoidAndNotRoot() # $1 name $2 extra_opt check_no_args() { - echo -n "Checking $1 output: no args" + echo -n "Checking $1 output: no args " perf stat $2 true commachecker --no-args echo "[Success]" diff --git a/tools/perf/tests/shell/stat+std_output.sh b/tools/perf/tests/shell/stat+std_output.sh index 98cc3356a04a..1f70aab45184 100755 --- a/tools/perf/tests/shell/stat+std_output.sh +++ b/tools/perf/tests/shell/stat+std_output.sh @@ -10,8 +10,8 @@ set -e stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX) -event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults cycles instructions branches branch-misses stalled-cycles-frontend stalled-cycles-backend) -event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "GHz" "insn per cycle" "/sec" "of all branches" "frontend cycles idle" "backend cycles idle") +event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults stalled-cycles-frontend stalled-cycles-backend cycles instructions branches branch-misses) +event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "frontend cycles idle" "backend cycles idle" "GHz" "insn per cycle" "/sec" "of all branches") metricgroup_name=(TopdownL1 TopdownL2) -- cgit v1.2.3 From 4d60e83dfcee794213878155463d8f7353a80864 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 23 Jun 2023 16:01:39 -0700 Subject: perf test: Skip metrics w/o event name in stat STD output linter This test checks if the output of perf stat to match event names and metrics. So it wants the output lines to have both event name and metric. Otherwise it should skip the line. On AMD machines, the instruction event has two metrics and they are printed in separate lines. It makes the line without event name like below: # perf stat -a sleep 1 Performance counter stats for 'system wide': 64,383.34 msec cpu-clock # 64.048 CPUs utilized 14,526 context-switches # 225.617 /sec 112 cpu-migrations # 1.740 /sec 190 page-faults # 2.951 /sec 807,558,652 cycles # 0.013 GHz (83.30%) 69,809,799 stalled-cycles-frontend # 8.64% frontend cycles idle (83.30%) 196,983,266 stalled-cycles-backend # 24.39% backend cycles idle (83.30%) 424,876,008 instructions # 0.53 insn per cycle (here) ---> # 0.46 stalled cycles per insn (83.30%) 97,788,321 branches # 1.519 M/sec (83.34%) 4,147,377 branch-misses # 4.24% of all branches (83.46%) 1.005241409 seconds time elapsed Also modern Intel machines have TopDown metrics which also don't have event names. # perf stat -a sleep 1 Performance counter stats for 'system wide': 8,015.39 msec cpu-clock # 7.996 CPUs utilized 5,823 context-switches # 726.477 /sec 189 cpu-migrations # 23.580 /sec 139 page-faults # 17.342 /sec 435,139,308 cycles # 0.054 GHz 193,891,345 instructions # 0.45 insn per cycle 42,773,028 branches # 5.336 M/sec 2,298,113 branch-misses # 5.37% of all branches TopdownL1 # 25.5 % tma_backend_bound /--> # 7.9 % tma_bad_speculation (here) --+ # 55.7 % tma_frontend_bound \--> # 10.9 % tma_retiring 1.002395924 seconds time elapsed There is a check to skip TopdownL1 and TopdownL2 specifically but it does not cover every affected lines. So there is another check to skip the line if it has nothing on the left side of # sign. Well.. it seems ok but that's not enough too. When aggregation mode (like --per-socket or --per-thread) is used, it adds some prefix (e.g. CPU socket, task name and PID) in the output line. So the test code ignores them to normalize result. A problem can happen for per-thread mode when task name contains one or more spaces. It'd only ignore the first part of the task name, and it thinks there's something more in the line so it would not skip. # perf stat -a --perf-thread sleep 1 ... perf-21276 # 70.2 % tma_backend_bound perf-21276 # 3.9 % tma_bad_speculation perf-21276 # 10.5 % tma_frontend_bound perf-21276 # 15.3 % tma_retiring ^^^^^^^^^^ (ignored) my task-21328 # 70.2 % tma_backend_bound my task-21328 # 3.9 % tma_bad_speculation my task-21328 # 10.5 % tma_frontend_bound my task-21328 # 15.3 % tma_retiring ^^ (ignored) So I think it should look at the metric names instead. Add skip_metric to hold the list of names to skip. It would contain 'stalled cycles per insn' and metrics started by 'tma_'. Fixes: 99a04a48f225 ("perf test: Add test case for the standard 'perf stat' output") Acked-by: Ian Rogers Cc: Kan Liang Cc: Peter Zijlstra Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Ingo Molnar Link: https://lore.kernel.org/r/20230623230139.985594-2-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/stat+std_output.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/tests/shell/stat+std_output.sh b/tools/perf/tests/shell/stat+std_output.sh index 1f70aab45184..f972b31fa0c2 100755 --- a/tools/perf/tests/shell/stat+std_output.sh +++ b/tools/perf/tests/shell/stat+std_output.sh @@ -12,8 +12,7 @@ stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX) event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults stalled-cycles-frontend stalled-cycles-backend cycles instructions branches branch-misses) event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "frontend cycles idle" "backend cycles idle" "GHz" "insn per cycle" "/sec" "of all branches") - -metricgroup_name=(TopdownL1 TopdownL2) +skip_metric=("stalled cycles per insn" "tma_") cleanup() { rm -f "${stat_output}" @@ -58,13 +57,14 @@ function commachecker() main_body=$(echo $line | cut -d' ' -f$prefix-) x=${main_body%#*} - # Check default metricgroup - y=$(echo $x | tr -d ' ') - [ "$y" = "" ] && continue - for i in "${!metricgroup_name[@]}"; do - [[ "$y" == *"${metricgroup_name[$i]}"* ]] && break + [ "$x" = "" ] && continue + + # Skip metrics without event name + y=${main_body#*#} + for i in "${!skip_metric[@]}"; do + [[ "$y" == *"${skip_metric[$i]}"* ]] && break done - [[ "$y" == *"${metricgroup_name[$i]}"* ]] && continue + [[ "$y" == *"${skip_metric[$i]}"* ]] && continue # Check default event for i in "${!event_name[@]}"; do -- cgit v1.2.3