From e5f4e68eed85fa8495d78cd966eecc2b27bb9e53 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Mon, 3 Apr 2023 14:11:38 -0700 Subject: tools/power turbostat: Fix added raw MSR output When using --Summary mode, added MSRs in raw mode always print zeros. Print the actual register contents. Example, with patch: note the added column: --add msr0x64f,u32,package,raw,REASON Where: 0x64F is MSR_CORE_PERF_LIMIT_REASONS Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.00 4800 35 1.42 0.76 0x00000000 0.00 4801 34 1.42 0.76 0x00000000 80.08 4531 66 108.17 107.52 0x08000000 98.69 4530 66 133.21 132.54 0x08000000 99.28 4505 66 128.26 127.60 0x0c000400 99.65 4486 68 124.91 124.25 0x0c000400 99.63 4483 68 124.90 124.25 0x0c000400 79.34 4481 41 99.80 99.13 0x0c000000 0.00 4801 41 1.40 0.73 0x0c000000 Where, for the test processor (i5-10600K): PKG Limit #1: 125.000 Watts, 8.000000 sec MSR bit 26 = log; bit 10 = status PKG Limit #2: 136.000 Watts, 0.002441 sec MSR bit 27 = log; bit 11 = status Example, without patch: Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.01 4800 35 1.43 0.77 0x00000000 0.00 4801 35 1.39 0.73 0x00000000 83.49 4531 66 112.71 112.06 0x00000000 98.69 4530 68 133.35 132.69 0x00000000 99.31 4500 67 127.96 127.30 0x00000000 99.63 4483 69 124.91 124.25 0x00000000 99.61 4481 69 124.90 124.25 0x00000000 99.61 4481 71 124.92 124.25 0x00000000 59.35 4479 42 75.03 74.37 0x00000000 0.00 4800 42 1.39 0.73 0x00000000 0.00 4801 42 1.42 0.76 0x00000000 c000000 [lenb: simplified patch to apply only to package scope] Signed-off-by: Doug Smythies Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a334377f92b..fca7913f6c84 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2444,9 +2444,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) - continue; - average.packages.counter[i] += p->counter[i]; + if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.counter[i] = p->counter[i]; + else + average.packages.counter[i] += p->counter[i]; } return 0; } -- cgit v1.2.3 From 3ac1d14d0583a2de75d49a5234d767e2590384dd Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Tue, 3 Oct 2023 05:07:51 +0000 Subject: tools/power turbostat: Increase the limit for fd opened When running turbostat, a system with 512 cpus reaches the limit for maximum number of file descriptors that can be opened. To solve this problem, the limit is raised to 2^15, which is a large enough number. Below data is collected from AMD server systems while running turbostat: |-----------+-------------------------------| | # of cpus | # of opened fds for turbostat | |-----------+-------------------------------| | 128 | 260 | |-----------+-------------------------------| | 192 | 388 | |-----------+-------------------------------| | 512 | 1028 | |-----------+-------------------------------| So, the new max limit would be sufficient up to 2^14 cpus (but this also depends on how many counters are enabled). 
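For reference, the per-process fd counts in the table above can be reproduced against a running turbostat with a short helper along these lines (an illustrative sketch, not part of the patch; it only assumes /proc is mounted):

#include <dirent.h>
#include <stdio.h>

/*
 * Count the file descriptors a process currently holds open by listing
 * /proc/<pid>/fd.  Point it at a turbostat pid to reproduce the table above.
 */
int main(int argc, char **argv)
{
	char path[64];
	struct dirent *de;
	DIR *dir;
	int count = 0;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/fd", argv[1]);
	dir = opendir(path);
	if (!dir) {
		perror(path);
		return 1;
	}

	while ((de = readdir(dir)) != NULL)
		if (de->d_name[0] != '.')
			count++;

	closedir(dir);
	printf("%s: %d open fds\n", path, count);
	return 0;
}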
Reviewed-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Wyes Karny Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fca7913f6c84..2550a0e35914 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -53,6 +53,8 @@ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define MAX_NOFILE 0x8000 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; @@ -6705,6 +6707,22 @@ void cmdline(int argc, char **argv) } } +void set_rlimit(void) +{ + struct rlimit limit; + + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to get rlimit"); + + if (limit.rlim_max < MAX_NOFILE) + limit.rlim_max = MAX_NOFILE; + if (limit.rlim_cur < MAX_NOFILE) + limit.rlim_cur = MAX_NOFILE; + + if (setrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to set rlimit"); +} + int main(int argc, char **argv) { int fd, ret; @@ -6730,6 +6748,9 @@ skip_cgroup_setting: probe_sysfs(); + if (!getuid()) + set_rlimit(); + turbostat_init(); msr_sum_record(); -- cgit v1.2.3 From 0b13410b52c4636aacb6964a4253a797c0fa0d16 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 7 Oct 2023 13:46:22 +0800 Subject: tools/power turbostat: Fix Bzy_MHz documentation typo The code calculates Bzy_MHz by multiplying TSC_delta * APERF_delta/MPERF_delta The man page erroneously showed that TSC_delta was divided. Signed-off-by: Peng Liu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8f08c3fd498d..1ba6340d3b3d 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -370,7 +370,7 @@ below the processor's base frequency. Busy% = MPERF_delta/TSC_delta -Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval +Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval Note that these calculations depend on TSC_delta, so they are not reliable during intervals when TSC_MHz is not running at the base frequency. -- cgit v1.2.3 From 227ed18f456a68bbb69807294a9089208663a6d3 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 22 Oct 2023 13:52:21 +0800 Subject: tools/power turbostat: Do not print negative LPI residency turbostat prints the abnormal SYS%LPI across suspend-to-idle: SYS%LPI = 114479815993277.50 This is reproduced by: Run a freeze cycle, e.g. "sleepgraph -m freeze -rtcwake 15". Then do a reboot. After boot up, launch the suspend-idle-idle and check the SYS%LPI field. The slp_so residence counter is in LPIT table, and BIOS does not clears this register across reset. The PMC expects the OS to calculate the LPI residency based on the delta. However, there is an firmware issue that the LPIT gets cleared to 0 during the second suspend to idle after the reboot, which brings negative delta value. 
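For clarity, the arithmetic behind the fix (values below are illustrative, not taken from the report): keeping the LPI residency fields signed lets a firmware counter reset show up as a negative delta that can be flagged as "(neg)", instead of an unsigned wrap that prints as an absurd residency:

#include <stdio.h>

int main(void)
{
	/* firmware cleared the LPIT counter between the two snapshots */
	unsigned long long old_lpi = 98765432100ULL;	/* stale pre-reset reading */
	unsigned long long new_lpi = 2500000ULL;	/* counter restarted near zero */

	unsigned long long u_delta = new_lpi - old_lpi;			/* wraps to a huge bogus value */
	long long s_delta = (long long)new_lpi - (long long)old_lpi;	/* negative, detectable */

	printf("unsigned delta: %llu\n", u_delta);
	printf("signed delta:   %lld  -> printed as \"(neg)\"\n", s_delta);
	return 0;
}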
[lenb: updated to print "neg" upon this BIOS failure] Reported-by: Todd Brandt Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2550a0e35914..c23703dd54aa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -991,8 +991,8 @@ struct pkg_data { unsigned long long pc8; unsigned long long pc9; unsigned long long pc10; - unsigned long long cpu_lpi; - unsigned long long sys_lpi; + long long cpu_lpi; + long long sys_lpi; unsigned long long pkg_wtd_core_c0; unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; @@ -1978,12 +1978,22 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); - if (DO_BIC(BIC_CPU_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); - if (DO_BIC(BIC_SYS_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { + if (p->cpu_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } + if (DO_BIC(BIC_SYS_LPI)) { + if (p->sys_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->sys_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } if (DO_BIC(BIC_PkgWatt)) outp += @@ -3832,7 +3842,8 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, + topo.allowed_cpus); } void set_max_cpu_num(void) @@ -6145,6 +6156,7 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } + void setup_all_buffers(bool startup) { topology_probe(startup); -- cgit v1.2.3 From bb6181fa6bc942aac3f7f2fa8e3831952a2ef118 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 20 Dec 2023 13:11:05 -0500 Subject: tools/power turbostat: Expand probe_intel_uncore_frequency() Print current frequency along with the current (and initial) limits Probe and print uncore config also for machines using the new cluster API Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 84 ++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c23703dd54aa..bbd2e0edadfa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4581,20 +4581,15 @@ static void dump_sysfs_file(char *path) static void probe_intel_uncore_frequency(void) { int i, j; - char path[128]; + char path[256]; if (!genuine_intel) return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) - return; - - /* Cluster level sysfs not supported yet. 
*/ - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) - return; + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + goto probe_cluster; - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - BIC_PRESENT(BIC_UNCORE_MHZ); + BIC_PRESENT(BIC_UNCORE_MHZ); if (quiet) return; @@ -4602,26 +4597,73 @@ static void probe_intel_uncore_frequency(void) for (i = 0; i < topo.num_packages; ++i) { for (j = 0; j < topo.num_die; ++j) { int k, l; + char path_base[128]; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, + j); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz", - i, j); + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz", - i, j); + sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz", - i, j); + sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz", - i, j); + sprintf(path, "%s/initial_max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); } } + return; + +probe_cluster: + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) + return; + + if (quiet) + return; + + for (i = 0;; ++i) { + int k, l; + char path_base[128]; + int package_id, domain_id, cluster_id; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + + if (access(path_base, R_OK)) + break; + + sprintf(path, "%s/package_id", path_base); + package_id = read_sysfs_int(path); + + sprintf(path, "%s/domain_id", path_base); + domain_id = read_sysfs_int(path); + + sprintf(path, "%s/fabric_cluster_id", path_base); + cluster_id = read_sysfs_int(path); + + sprintf(path, "%s/min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, + cluster_id, k / 1000, l / 1000); + + sprintf(path, "%s/initial_min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/initial_max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); + } } static void probe_graphics(void) -- cgit v1.2.3 From fb5ceca046efc84f69fcf9779a013f8a0e63bbff Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 12 Jan 2024 13:48:14 +0100 Subject: tools/power turbostat: Print ucode revision only if valid If the MSR read were to fail, turbostat would print "microcode 0x0" Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- 
tools/power/x86/turbostat/turbostat.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbd2e0edadfa..a4a40a6e1b95 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5679,6 +5679,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned long long ucode_patch = 0; + bool ucode_patch_valid = false; eax = ebx = ecx = edx = 0; @@ -5708,6 +5709,8 @@ void process_cpuid() if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; /* * check max extended function levels of CPUID. @@ -5718,9 +5721,12 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", + family, model, stepping, family, model, stepping); + if (ucode_patch_valid) + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fputc('\n', outf); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", -- cgit v1.2.3 From 538d505fde20393bce1e6fb95cec82b56cdd22ef Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jan 2024 15:22:35 +0100 Subject: tools/power turbostat: Read base_hz and bclk from CPUID.16H if available If MSRs cannot be read, values can be obtained from cpuid. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a4a40a6e1b95..c35c48b6a99a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5848,6 +5848,15 @@ void process_cpuid() base_mhz = max_mhz = bus_mhz = edx = 0; __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); + + bclk = bus_mhz; + + base_hz = base_mhz * 1000000; + has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; + if (!quiet) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); -- cgit v1.2.3 From b6fe938317eed58e8c687bd5965a956e15fb5828 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Jan 2024 12:25:42 -0600 Subject: tools/power turbostat: Fix warning upon failed /dev/cpu_dma_latency read Previously a failed read of /dev/cpu_dma_latency erroneously complained turbostat: capget(CAP_SYS_ADMIN) failed, try "# setcap cap_sys_admin=ep ./turbostat This went unnoticed because this file is typically visible to root, and turbostat was typically run as root. Going forward, when a non-root user can run turbostat... Complain about failed read access to this file only if --debug is used. 
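For context on what turbostat is reading here: /dev/cpu_dma_latency exposes the current CPU latency QoS target as a raw 32-bit value in microseconds, so a minimal standalone reader looks roughly like the sketch below (illustrative only, mirroring what print_dev_latency() consumes):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t latency_us;
	int fd = open("/dev/cpu_dma_latency", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu_dma_latency");	/* needs read permission on the node */
		return 1;
	}
	if (read(fd, &latency_us, sizeof(latency_us)) != (ssize_t)sizeof(latency_us)) {
		perror("read");
		close(fd);
		return 1;
	}
	printf("/dev/cpu_dma_latency: %d usec\n", latency_us);
	close(fd);
	return 0;
}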
Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c35c48b6a99a..531f37e5f92a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5545,7 +5545,8 @@ void print_dev_latency(void) fd = open(path, O_RDONLY); if (fd < 0) { - warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); + if (debug) + warnx("Read %s failed", path); return; } -- cgit v1.2.3 From 2d2ccd57338779469777d4319152151272994182 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 5 Feb 2024 15:56:25 -0600 Subject: tools/power turbostat: enhance -D (debug counter dump) output Eliminate redundant debug output for core and package scope counters. Include name and path for all "ADDED" counters. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 531f37e5f92a..60432753fe6a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1673,11 +1673,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); + outp += + sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + t->counter[i], mp->path); } } - if (c) { + if (c && is_cpu_first_thread_in_core(t, c, p)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); @@ -1687,12 +1689,14 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "Joules: %0X\n", c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); + outp += + sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + c->counter[i], mp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p) { + if (p && is_cpu_first_core_in_package(t, c, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -1721,7 +1725,9 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); + outp += + sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + p->counter[i], mp->path); } } -- cgit v1.2.3 From 3e4048466c396cff52c6d435156dbcd0571e4381 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:48:09 +0100 Subject: tools/power turbostat: Add --no-msr option Add --no-msr option to allow users to run turbostat without accessing MSRs via the MSR driver. 
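As background for what the new option disables: the MSR-driver path is a pread() of 8 bytes from /dev/cpu/N/msr at the offset of the register number, which requires CAP_SYS_RAWIO plus read permission on the device node. A minimal sketch of that access, i.e. what --no-msr avoids, assuming the msr module is loaded:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int read_msr(int cpu, unsigned int reg, unsigned long long *val)
{
	char path[32];
	int fd;

	snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;	/* driver not loaded, no permission, or no such cpu */

	if (pread(fd, val, sizeof(*val), reg) != (ssize_t)sizeof(*val)) {
		close(fd);
		return -1;
	}

	close(fd);
	return 0;
}

With the option in place, counters that can only come from MSRs are disabled rather than causing turbostat to exit.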
Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 + tools/power/x86/turbostat/turbostat.c | 205 ++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 1ba6340d3b3d..28be452fbfe2 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -67,6 +67,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP ++\fB--no-msr\fP Disable all the uses of the MSR driver. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 60432753fe6a..4d5437a9725b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -36,6 +36,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -265,6 +266,7 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +bool no_msr; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1282,13 +1284,36 @@ int get_msr_fd(int cpu) sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " + "or run with --no-msr, or run as root", pathname); fd_percpu[cpu] = fd; return fd; } +static void bic_disable_msr_access(void) +{ + const unsigned long bic_msrs = + BIC_Avg_MHz | + BIC_Busy | + BIC_Bzy_MHz | + BIC_SMI | + BIC_CPU_c1 | + BIC_CPU_c3 | + BIC_CPU_c6 | + BIC_CPU_c7 | + BIC_Mod_c6 | + BIC_CoreTmp | + BIC_Totl_c0 | + BIC_Any_c0 | + BIC_GFX_c0 | + BIC_CPUGFX | + BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + + bic_enabled &= ~bic_msrs; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); @@ -1328,6 +1353,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; + assert(!no_msr); + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) @@ -1371,6 +1398,7 @@ void help(void) " Override default 5-second measurement interval\n" " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" + " -M, --no-msr Disable all uses of the MSR driver\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -2597,6 +2625,7 @@ unsigned long long snapshot_sysfs_counter(char *path) int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) { if (mp->msr_num != 0) { + assert(!no_msr); if (get_msr(cpu, mp->msr_num, counterp)) return -1; } else { @@ -2646,6 +2675,9 @@ int get_epb(int cpu) return epb; msr_fallback: + if (no_msr) + return -1; + get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); return msr & 0xf; @@ -2865,7 +2897,7 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - 
if (platform->rapl_msrs & RAPL_AMD_F17H) { + if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2930,41 +2962,44 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; + if (!no_msr) { + if (platform->rapl_msrs & RAPL_PKG) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + return -13; + p->energy_pkg = msr; + } + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + return -14; + p->energy_cores = msr; + } + if (platform->rapl_msrs & RAPL_DRAM) { + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + return -15; + p->energy_dram = msr; + } + if (platform->rapl_msrs & RAPL_GFX) { + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + return -16; + p->energy_gfx = msr; + } + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) + return -16; + p->rapl_pkg_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) + return -16; + p->rapl_dram_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_AMD_F17H) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) + return -13; + p->energy_pkg = msr; + } } + if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; @@ -3072,7 +3107,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; switch (platform->cst_limit) { @@ -3116,7 +3151,7 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); @@ -3134,7 +3169,7 @@ static void dump_power_ctl(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); @@ -3340,7 +3375,7 @@ static void dump_cst_cfg(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); @@ -3412,7 +3447,7 @@ void print_irtl(void) { unsigned long long msr; - if (!platform->has_irtl_msrs) + if (!platform->has_irtl_msrs || no_msr) return; if (platform->supported_cstates & PC3) { @@ -4193,6 +4228,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) int ret, idx; 
unsigned long long msr_cur, msr_last; + assert(!no_msr); + if (!per_cpu_msr_sum) return 1; @@ -4221,6 +4258,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg UNUSED(c); UNUSED(p); + assert(!no_msr); + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; off_t offset; @@ -4465,7 +4504,7 @@ void check_permissions(void) sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr"); + warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); } /* if all else fails, thell them to be root */ @@ -4482,7 +4521,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4522,7 +4561,7 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->trl_msrs & TRL_LIMIT2) @@ -4845,6 +4884,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!has_hwp) return 0; @@ -4931,6 +4973,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + cpu = t->cpu_id; /* per-package */ @@ -5264,7 +5309,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) */ void probe_rapl(void) { - if (!platform->rapl_msrs) + if (!platform->rapl_msrs || no_msr) return; if (genuine_intel) @@ -5320,7 +5365,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5367,6 +5412,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!(do_dts || do_ptm)) return 0; @@ -5464,6 +5512,9 @@ void decode_feature_control_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? 
"SGX" : ""); @@ -5473,6 +5524,9 @@ void decode_misc_enable_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!genuine_intel) return; @@ -5490,6 +5544,9 @@ void decode_misc_feature_control(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_feature_control) return; @@ -5511,6 +5568,9 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_pwr_mgmt) return; @@ -5530,6 +5590,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_c6_demotion_policy_config) return; @@ -5626,7 +5689,7 @@ void probe_cstates(void) if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (platform->has_ext_cst_msrs) { + if (platform->has_ext_cst_msrs && !no_msr) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); @@ -5714,10 +5777,12 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) - warnx("get_msr(UCODE)"); - else - ucode_patch_valid = true; + if (!no_msr) { + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; + } /* * check max extended function levels of CPUID. @@ -5892,7 +5957,7 @@ void probe_pm_features(void) probe_thermal(); - if (platform->has_nhm_msrs) + if (platform->has_nhm_msrs && !no_msr) BIC_PRESENT(BIC_SMI); if (!quiet) @@ -6252,8 +6317,10 @@ void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - check_dev_msr(); - check_permissions(); + if (!no_msr) { + check_dev_msr(); + check_permissions(); + } process_cpuid(); probe_pm_features(); linux_perf_init(); @@ -6370,6 +6437,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, { struct msr_counter *msrp; + if (no_msr && msr_num) + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); + msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) { perror("calloc"); @@ -6674,6 +6744,7 @@ void cmdline(int argc, char **argv) { "list", no_argument, 0, 'l' }, { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, + { "no-msr", no_argument, 0, 'M' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6683,7 +6754,22 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { + /* + * Parse some options early, because they may make other options invalid, + * like adding the MSR counter with --add and at the same time using --no-msr. 
+ */ + while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + switch (opt) { + case 'M': + no_msr = 1; + break; + default: + break; + } + } + optind = 0; + + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6741,6 +6827,9 @@ void cmdline(int argc, char **argv) case 'q': quiet = 1; break; + case 'M': + /* Parsed earlier */ + break; case 'n': num_iterations = strtod(optarg, NULL); @@ -6817,6 +6906,9 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); + if (no_msr) + bic_disable_msr_access(); + if (!quiet) { print_version(); print_bootcmd(); @@ -6829,7 +6921,8 @@ skip_cgroup_setting: turbostat_init(); - msr_sum_record(); + if (!no_msr) + msr_sum_record(); /* dump counters and exit */ if (dump_only) -- cgit v1.2.3 From a0e86c90b83c118985260e36490583b5a38d4359 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:58:02 +0100 Subject: tools/power turbostat: Add --no-perf option Add the --no-perf option to allow users to run turbostat without accessing perf. Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 ++ tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 28be452fbfe2..567327b004e6 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -69,6 +69,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP ++\fB--no-perf\fP Disable all the uses of the perf API. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. 
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4d5437a9725b..bad2fec7f342 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -267,6 +267,7 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; +bool no_perf; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1314,8 +1315,17 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } +static void bic_disable_perf_access(void) +{ + const unsigned long bic_perf = BIC_IPC; + + bic_enabled &= ~bic_perf; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + assert(!no_perf); + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } @@ -1332,8 +1342,8 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); - BIC_NOT_PRESENT(BIC_IPC); + warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); + bic_disable_perf_access(); } return fd; @@ -1399,6 +1409,7 @@ void help(void) " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" " -M, --no-msr Disable all uses of the MSR driver\n" + " -P, --no-perf Disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -6745,6 +6756,7 @@ void cmdline(int argc, char **argv) { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, { "no-msr", no_argument, 0, 'M' }, + { "no-perf", no_argument, 0, 'P' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6758,11 +6770,14 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; break; + case 'P': + no_perf = 1; + break; default: break; } @@ -6828,6 +6843,7 @@ void cmdline(int argc, char **argv) quiet = 1; break; case 'M': + case 'P': /* Parsed earlier */ break; case 'n': @@ -6909,6 +6925,9 @@ skip_cgroup_setting: if (no_msr) bic_disable_msr_access(); + if (no_perf) + bic_disable_perf_access(); + if (!quiet) { print_version(); print_bootcmd(); -- cgit v1.2.3 From e48934c9f1048ed4640b60321baf1986d1a470e1 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 16:42:22 +0100 Subject: tools/power turbostat: Add reading aperf and mperf via perf API By using the perf API we spend less time in between the reads of the counters, resulting in more accurate calculations of the dependent metrics. Using perf API is also usually faster overall, although cache miss, if we get one, is more costly when using perf vs MSR driver. We would fallback to the msr reads if the sysfs isn't there or when in --no-perf mode. 
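The latency win comes from the grouped read: with PERF_FORMAT_GROUP, a single read() on the group-leader fd returns every counter in the group, so APERF and MPERF are sampled back-to-back by the kernel instead of through two separate MSR pread() calls. A minimal sketch of unpacking such a read, assuming the leader fd was opened with PERF_FORMAT_GROUP as open_amperf_fd() in this patch does:

#include <unistd.h>

struct amperf_read {
	unsigned long long nr;		/* number of counters in the group: expect 2 */
	unsigned long long aperf;	/* group leader */
	unsigned long long mperf;	/* second group member */
};

/* One read() returns both counters together; returns 0 on success. */
static int read_amperf_group(int leader_fd, unsigned long long *aperf, unsigned long long *mperf)
{
	struct amperf_read r;

	if (read(leader_fd, &r, sizeof(r)) != (ssize_t)sizeof(r))
		return -1;
	if (r.nr != 2)
		return -1;

	*aperf = r.aperf;
	*mperf = r.mperf;
	return 0;
}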
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 374 +++++++++++++++++++++++++++------- 1 file changed, 301 insertions(+), 73 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bad2fec7f342..1294c46c2170 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -59,6 +59,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -207,10 +208,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +struct amperf_group_fd; + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -268,6 +272,7 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; +enum amperf_source amperf_source; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1296,9 +1301,6 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { const unsigned long bic_msrs = - BIC_Avg_MHz | - BIC_Busy | - BIC_Bzy_MHz | BIC_SMI | BIC_CPU_c1 | BIC_CPU_c3 | @@ -1329,21 +1331,30 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int perf_instr_count_open(int cpu_num) +static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { - struct perf_event_attr pea; - int fd; + struct perf_event_attr attr; + const pid_t pid = -1; + const unsigned long flags = 0; + + memset(&attr, 0, sizeof(struct perf_event_attr)); - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.type = type; + attr.size = sizeof(struct perf_event_attr); + attr.config = config; + attr.disabled = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.read_format = read_format; - /* counter for cpu_num, including user + kernel and all processes */ - fd = perf_event_open(&pea, -1, cpu_num, -1, 0); + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); - bic_disable_perf_access(); + if (errno == EACCES) { + errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" + " or use --no-perf or run as root", progname); + } else { + perror("perf_event_open"); + errx(1, "use --no-perf or run as root"); + } } return fd; @@ -1354,7 +1365,8 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + fd_instr_count_percpu[cpu] = + open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2762,6 +2774,175 @@ int 
get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } +struct amperf_group_fd { + int aperf; /* Also the group descriptor */ + int mperf; +}; + +static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +{ + int fdmt; + char buf[16]; + unsigned int v; + + fdmt = open(path, O_RDONLY, 0); + if (fdmt == -1) + errx(1, "Failed to read perf counter info %s\n", path); + + if (read(fdmt, buf, sizeof(buf)) <= 0) + return 0; + + buf[sizeof(buf) - 1] = '\0'; + + if (sscanf(buf, parse_format, &v) != 1) + errx(1, "Failed to parse perf counter info %s\n", path); + + close(fdmt); + + return v; +} + +static unsigned read_msr_type(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/type"; + const char *const format = "%u"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_aperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_mperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static struct amperf_group_fd open_amperf_fd(int cpu) +{ + const unsigned int msr_type = read_msr_type(); + const unsigned int aperf_config = read_aperf_config(); + const unsigned int mperf_config = read_mperf_config(); + struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + + fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + + return fds; +} + +static int get_amperf_fd(int cpu) +{ + assert(fd_amperf_percpu); + + if (fd_amperf_percpu[cpu].aperf) + return fd_amperf_percpu[cpu].aperf; + + fd_amperf_percpu[cpu] = open_amperf_fd(cpu); + + return fd_amperf_percpu[cpu].aperf; +} + +/* Read APERF, MPERF and TSC using the perf API. */ +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) +{ + union { + struct { + unsigned long nr_entries; + unsigned long aperf; + unsigned long mperf; + }; + + unsigned long as_array[3]; + } cnt; + + const int fd_amperf = get_amperf_fd(cpu); + + /* + * Read the TSC with rdtsc, because we want the absolute value and not + * the offset from the start of the counter. + */ + t->tsc = rdtsc(); + + const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) + return -2; + + t->aperf = cnt.aperf * aperf_mperf_multiplier; + t->mperf = cnt.mperf * aperf_mperf_multiplier; + + return 0; +} + +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) +{ + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + int aperf_mperf_retry_count = 0; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system call, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. 
+ */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + +retry: + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + tsc_between = rdtsc(); + + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) + return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differ by more than 2x, then try again. + */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + + t->aperf = t->aperf * aperf_mperf_multiplier; + t->mperf = t->mperf * aperf_mperf_multiplier; + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2771,7 +2952,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; - int aperf_mperf_retry_count = 0; struct msr_counter *mp; int i; @@ -2784,63 +2964,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (first_counter_read) get_apic_id(t); -retry: + t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) || soft_c1_residency_display(BIC_Avg_MHz)) { - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; + int status = -1; - tsc_after = rdtsc(); + assert(!no_perf || !no_msr); - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. 
- */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + switch (amperf_source) { + case AMPERF_SOURCE_PERF: + status = read_aperf_mperf_tsc_perf(t, cpu); + break; + case AMPERF_SOURCE_MSR: + status = read_aperf_mperf_tsc_msr(t, cpu); + break; } - aperf_mperf_retry_count = 0; - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; + if (status != 0) + return status; } if (DO_BIC(BIC_IPC)) @@ -3516,6 +3659,21 @@ void free_fd_percpu(void) free(fd_percpu); } +void free_fd_amperf_percpu(void) +{ + int i; + + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_amperf_percpu[i].mperf != 0) + close(fd_amperf_percpu[i].mperf); + + if (fd_amperf_percpu[i].aperf != 0) + close(fd_amperf_percpu[i].aperf); + } + + free(fd_amperf_percpu); +} + void free_all_buffers(void) { int i; @@ -3557,6 +3715,7 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_amperf_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -5647,17 +5806,62 @@ void print_dev_latency(void) */ void linux_perf_init(void) { - if (!BIC_IS_ENABLED(BIC_IPC)) - return; - if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); - if (fd_instr_count_percpu == NULL) - err(-1, "calloc fd_instr_count_percpu"); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + } - BIC_PRESENT(BIC_IPC); + const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || + BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { + fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); + if (fd_amperf_percpu == NULL) + err(-1, "calloc fd_amperf_percpu"); + } +} + +static int has_amperf_access_via_msr(void) +{ + const int cpu = sched_getcpu(); + unsigned long long dummy; + + if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + return 0; + + if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + return 0; + + return 1; +} + +static int has_amperf_access_via_perf(void) +{ + if (access("/sys/bus/event_source/devices/msr/type", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + return 0; + + return 1; +} + +/* Check if we can access APERF and MPERF */ +static int has_amperf_access(void) +{ + if (!no_msr && has_amperf_access_via_msr()) + return 1; + + if (!no_perf && has_amperf_access_via_perf()) + return 1; + + return 0; } void probe_cstates(void) @@ -5845,10 +6049,11 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf) { + if (has_aperf && has_amperf_access()) { BIC_PRESENT(BIC_Avg_MHz); BIC_PRESENT(BIC_Busy); BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_IPC); } do_dts = eax & (1 << 0); if (do_dts) @@ -6324,6 +6529,19 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void set_amperf_source(void) +{ + amperf_source = AMPERF_SOURCE_PERF; + + if (no_perf || !has_amperf_access_via_perf()) + amperf_source = AMPERF_SOURCE_MSR; + + if (quiet || !debug) + return; + + fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source 
== AMPERF_SOURCE_MSR ? "msr" : "perf"); +} + void turbostat_init() { setup_all_buffers(true); @@ -6334,6 +6552,7 @@ void turbostat_init() } process_cpuid(); probe_pm_features(); + set_amperf_source(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); @@ -6341,6 +6560,15 @@ void turbostat_init() if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); + + /* + * If TSC tweak is needed, but couldn't get it, + * disable more BICs, since it can't be reported accurately. + */ + if (platform->enable_tsc_tweak && !has_base_hz) { + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + } } int fork_it(char **argv) -- cgit v1.2.3 From 5088741ec805cd249e27c7176ed09bdab164960e Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 15 Jan 2024 19:04:21 +0100 Subject: tools/power turbostat: detect and disable unavailable BICs at runtime To allow unprivileged user to run turbostat seamlessly. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 188 ++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 63 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1294c46c2170..30db6e92193a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1317,13 +1317,6 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } -static void bic_disable_perf_access(void) -{ - const unsigned long bic_perf = BIC_IPC; - - bic_enabled &= ~bic_perf; -} - static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { assert(!no_perf); @@ -1331,12 +1324,14 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) +static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { struct perf_event_attr attr; const pid_t pid = -1; const unsigned long flags = 0; + assert(!no_perf); + memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = type; @@ -1347,15 +1342,6 @@ static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int c attr.read_format = read_format; const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); - if (fd == -1) { - if (errno == EACCES) { - errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" - " or use --no-perf or run as root", progname); - } else { - perror("perf_event_open"); - errx(1, "use --no-perf or run as root"); - } - } return fd; } @@ -1365,8 +1351,7 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = - open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2833,8 +2818,8 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int mperf_config = read_mperf_config(); struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; - fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + fds.aperf = 
open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); return fds; } @@ -4509,7 +4494,8 @@ release_msr: /* * set_my_sched_priority(pri) - * return previous + * return previous priority on success + * return value < -20 on failure */ int set_my_sched_priority(int priority) { @@ -4519,16 +4505,16 @@ int set_my_sched_priority(int priority) errno = 0; original_priority = getpriority(PRIO_PROCESS, 0); if (errno && (original_priority == -1)) - err(errno, "getpriority"); + return -21; retval = setpriority(PRIO_PROCESS, 0, priority); if (retval) - errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname); + return -21; errno = 0; retval = getpriority(PRIO_PROCESS, 0); if (retval != priority) - err(retval, "getpriority(%d) != setpriority(%d)", retval, priority); + return -21; return original_priority; } @@ -4543,6 +4529,9 @@ void turbostat_loop() /* * elevate own priority for interval mode + * + * ignore on error - we probably don't have permission to set it, but + * it's not a big deal */ set_my_sched_priority(-20); @@ -4628,10 +4617,13 @@ void check_dev_msr() struct stat sb; char pathname[32]; + if (no_msr) + return; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + no_msr = 1; } /* @@ -4643,47 +4635,51 @@ int check_for_cap_sys_rawio(void) { cap_t caps; cap_flag_value_t cap_flag_value; + int ret = 0; caps = cap_get_proc(); if (caps == NULL) - err(-6, "cap_get_proc\n"); + return 1; - if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) - err(-6, "cap_get\n"); + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { + ret = 1; + goto free_and_exit; + } if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); - return 1; + ret = 1; + goto free_and_exit; } +free_and_exit: if (cap_free(caps) == -1) err(-6, "cap_free\n"); - return 0; + return ret; } -void check_permissions(void) +void check_msr_permission(void) { - int do_exit = 0; + int failed = 0; char pathname[32]; + if (no_msr) + return; + /* check for CAP_SYS_RAWIO */ - do_exit += check_for_cap_sys_rawio(); + failed += check_for_cap_sys_rawio(); /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { - do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); + failed++; } - /* if all else fails, thell them to be root */ - if (do_exit) - if (getuid() != 0) - warnx("... or simply run as root"); - - if (do_exit) - exit(-6); + if (failed) { + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr"); + no_msr = 1; + } } void probe_bclk(void) @@ -5800,6 +5796,28 @@ void print_dev_latency(void) close(fd); } +static int has_instr_count_access(void) +{ + int fd; + int has_access; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + has_access = fd != -1; + + if (fd != -1) + close(fd); + + if (!has_access) + warnx("Failed to access %s. 
Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "instructions retired perf counter", "--no-perf"); + + return has_access; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5826,13 +5844,15 @@ void linux_perf_init(void) static int has_amperf_access_via_msr(void) { - const int cpu = sched_getcpu(); unsigned long long dummy; - if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + if (no_msr) + return 0; + + if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) return 0; - if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) return 0; return 1; @@ -5840,16 +5860,44 @@ static int has_amperf_access_via_msr(void) static int has_amperf_access_via_perf(void) { - if (access("/sys/bus/event_source/devices/msr/type", F_OK)) - return 0; + struct amperf_group_fd fds; - if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) - return 0; + /* + * Cache the last result, so we don't warn the user multiple times + * + * Negative means cached, no access + * Zero means not cached + * Positive means cached, has access + */ + static int has_access_cached; - if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + if (no_perf) return 0; - return 1; + if (has_access_cached != 0) + return has_access_cached > 0; + + fds = open_amperf_fd(base_cpu); + has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); + + if (fds.aperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "APERF perf counter", "--no-perf"); + else + close(fds.aperf); + + if (fds.mperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "MPERF perf counter", "--no-perf"); + else + close(fds.mperf); + + if (has_access_cached == 0) + has_access_cached = -1; + + return has_access_cached > 0; } /* Check if we can access APERF and MPERF */ @@ -6542,14 +6590,34 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +void check_msr_access(void) +{ + check_dev_msr(); + check_msr_permission(); + + if (no_msr) + bic_disable_msr_access(); +} + +void check_perf_access(void) +{ + if (no_perf || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; + + if (!has_amperf_access()) { + bic_enabled &= ~BIC_Avg_MHz; + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + bic_enabled &= ~BIC_IPC; + } +} + void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - if (!no_msr) { - check_dev_msr(); - check_permissions(); - } + check_msr_access(); + check_perf_access(); process_cpuid(); probe_pm_features(); set_amperf_source(); @@ -7150,12 +7218,6 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); - if (no_msr) - bic_disable_msr_access(); - - if (no_perf) - bic_disable_perf_access(); - if (!quiet) { print_version(); print_bootcmd(); -- cgit v1.2.3 From aed48c48fa65abdd584e14f7d0273711bc10d223 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 30 Jan 2024 23:57:07 +0100 Subject: tools/power turbostat: add early exits for permission checks Checking early if the permissions are even needed gets rid of the warnings about some of them missing. 
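Concretely, the probe is now gated on whether any enabled column needs the access at all. A condensed sketch of the MSR side (illustrative only, not literal turbostat code; the real logic is the new is_msr_access_required() and is_aperf_access_required() helpers in the diff below):

  /* Sketch: skip the probe, and therefore the warning, when nothing needs MSRs */
  if (!is_msr_access_required())
          no_msr = 1;               /* no selected counter reads an MSR */
  else
          check_msr_permission();   /* only now can a missing permission warn */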
Earlier we issued a warning in case of missing MSR and/or perf permissions, even when user never asked for counters that require those. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 66 ++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 30db6e92193a..e5e01b58992e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5818,6 +5818,14 @@ static int has_instr_count_access(void) return has_access; } +bool is_aperf_access_required(void) +{ + return BIC_IS_ENABLED(BIC_Avg_MHz) + || BIC_IS_ENABLED(BIC_Busy) + || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC); +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5833,8 +5841,7 @@ void linux_perf_init(void) err(-1, "calloc fd_instr_count_percpu"); } - const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || - BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + const bool aperf_required = is_aperf_access_required(); if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -5903,6 +5910,9 @@ static int has_amperf_access_via_perf(void) /* Check if we can access APERF and MPERF */ static int has_amperf_access(void) { + if (!is_aperf_access_required()) + return 0; + if (!no_msr && has_amperf_access_via_msr()) return 1; @@ -6581,7 +6591,8 @@ static void set_amperf_source(void) { amperf_source = AMPERF_SOURCE_PERF; - if (no_perf || !has_amperf_access_via_perf()) + const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; if (quiet || !debug) @@ -6590,8 +6601,51 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? 
"msr" : "perf"); } +bool is_msr_access_required(void) +{ + /* TODO: add detection for dynamic counters from add_counter() */ + if (no_msr) + return false; + + return BIC_IS_ENABLED(BIC_SMI) + || BIC_IS_ENABLED(BIC_CPU_c1) + || BIC_IS_ENABLED(BIC_CPU_c3) + || BIC_IS_ENABLED(BIC_CPU_c6) + || BIC_IS_ENABLED(BIC_CPU_c7) + || BIC_IS_ENABLED(BIC_Mod_c6) + || BIC_IS_ENABLED(BIC_CoreTmp) + || BIC_IS_ENABLED(BIC_Totl_c0) + || BIC_IS_ENABLED(BIC_Any_c0) + || BIC_IS_ENABLED(BIC_GFX_c0) + || BIC_IS_ENABLED(BIC_CPUGFX) + || BIC_IS_ENABLED(BIC_Pkgpc3) + || BIC_IS_ENABLED(BIC_Pkgpc6) + || BIC_IS_ENABLED(BIC_Pkgpc2) + || BIC_IS_ENABLED(BIC_Pkgpc7) + || BIC_IS_ENABLED(BIC_Pkgpc8) + || BIC_IS_ENABLED(BIC_Pkgpc9) + || BIC_IS_ENABLED(BIC_Pkgpc10) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_PkgWatt) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_GFXWatt) + || BIC_IS_ENABLED(BIC_RAMWatt) + || BIC_IS_ENABLED(BIC_Pkg_J) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_GFX_J) + || BIC_IS_ENABLED(BIC_RAM_J) + || BIC_IS_ENABLED(BIC_PKG__) + || BIC_IS_ENABLED(BIC_RAM__) + || BIC_IS_ENABLED(BIC_PkgTmp) + || (is_aperf_access_required() && !has_amperf_access_via_perf()); +} + void check_msr_access(void) { + if (!is_msr_access_required()) + no_msr = 1; + check_dev_msr(); check_msr_permission(); @@ -6601,10 +6655,12 @@ void check_msr_access(void) void check_perf_access(void) { - if (no_perf || !has_instr_count_access()) + const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; - if (!has_amperf_access()) { + const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; bic_enabled &= ~BIC_Bzy_MHz; -- cgit v1.2.3 From 4a1bb4dad5d16669e841410944e7bc84ef7263fc Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 14 Mar 2024 11:36:55 +0100 Subject: tools/power turbostat: Clear added counters when in no-msr mode If user request --no-msr or is not able to access the MSRs, turbostat should clear all the counters added with --add. Because MSR access permission checks are done after the cmdline is parsed, the decision has to be defered up until the transition into no-msr mode happen. 
Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5e01b58992e..b4a892bf22bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1160,6 +1160,37 @@ struct sys_counters { struct msr_counter *pp; } sys; +void free_sys_counters(void) +{ + struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.cp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.pp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + sys.added_thread_counters = 0; + sys.added_core_counters = 0; + sys.added_package_counters = 0; + sys.tp = NULL; + sys.cp = NULL; + sys.pp = NULL; +} + struct system_summary { struct thread_data threads; struct core_data cores; @@ -1315,6 +1346,8 @@ static void bic_disable_msr_access(void) BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; bic_enabled &= ~bic_msrs; + + free_sys_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -6601,12 +6634,24 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +bool has_added_counters(void) +{ + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ + + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} + bool is_msr_access_required(void) { - /* TODO: add detection for dynamic counters from add_counter() */ if (no_msr) return false; + if (has_added_counters()) + return true; + return BIC_IS_ENABLED(BIC_SMI) || BIC_IS_ENABLED(BIC_CPU_c1) || BIC_IS_ENABLED(BIC_CPU_c3) -- cgit v1.2.3 From ebf8449caba1df2eb6ba0b465fe15dc06d3b9135 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 15 Feb 2024 12:50:19 +0100 Subject: tools/power turbostat: Add proper re-initialization for perf file descriptors Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b4a892bf22bf..a380829c5890 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3669,18 +3669,25 @@ void free_fd_percpu(void) { int i; + if (!fd_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_percpu[i] != 0) close(fd_percpu[i]); } free(fd_percpu); + fd_percpu = NULL; } void free_fd_amperf_percpu(void) { int i; + if (!fd_amperf_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_amperf_percpu[i].mperf != 0) close(fd_amperf_percpu[i].mperf); @@ -3690,6 +3697,21 @@ void free_fd_amperf_percpu(void) } free(fd_amperf_percpu); + fd_amperf_percpu = NULL; +} + +void free_fd_instr_count_percpu(void) +{ + if (!fd_instr_count_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_instr_count_percpu[i] != 0) + close(fd_instr_count_percpu[i]); + } + + free(fd_instr_count_percpu); + fd_instr_count_percpu = NULL; } void free_all_buffers(void) @@ -3733,6 +3755,7 @@ void 
free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free(irq_column_2_cpu); @@ -4067,10 +4090,13 @@ static void update_effective_set(bool startup) err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); } +void linux_perf_init(void); + void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); + linux_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } -- cgit v1.2.3 From 7d8ed162e6a92268d4b2b84d364a931216102c8e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:26:59 +0000 Subject: memblock tests: fix undefined reference to `early_pfn_to_nid' commit 6a9531c3a880 ("memblock: fix crash when reserved memory is not added to memory") introduce the usage of early_pfn_to_nid, which is not defined in memblock tests. The original definition of early_pfn_to_nid is defined in mm.h, so let add this in the corresponding mm.h. Signed-off-by: Wei Yang CC: Yajun Deng CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index f3c82ab5b14c..7d73da098047 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -37,4 +37,9 @@ static inline void totalram_pages_add(long count) { } +static inline int early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + #endif -- cgit v1.2.3 From e0f5a8e74be88f2476e58b25d3b49a9521bdc4ec Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:00 +0000 Subject: memblock tests: fix undefined reference to `panic' commit e96c6b8f212a ("memblock: report failures when memblock_can_resize is not set") introduced the usage of panic, which is not defined in memblock test. Let's define it directly in panic.h to fix it. Signed-off-by: Wei Yang CC: Song Shuai CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/kernel.h | 1 + tools/include/linux/panic.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tools/include/linux/panic.h (limited to 'tools') diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 4b0673bf52c2..07cfad817d53 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h new file mode 100644 index 000000000000..9c8f17a41ce8 --- /dev/null +++ b/tools/include/linux/panic.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_PANIC_H +#define _TOOLS_LINUX_PANIC_H + +#include +#include +#include + +static inline void panic(const char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + exit(-1); +} + +#endif -- cgit v1.2.3 From 1a4ea83a6e67f1415a1f17c1af5e9c814c882bb5 Mon Sep 17 00:00:00 2001 From: Yuanhe Shu Date: Mon, 26 Feb 2024 11:18:16 +0800 Subject: selftests/ftrace: Limit length in subsystem-enable tests While sched* events being traced and sched* events continuously happen, "[xx] event tracing - enable/disable with subsystem level files" would not stop as on some slower systems it seems to take forever. 
Select the first 100 lines of output would be enough to judge whether there are more than 3 types of sched events. Fixes: 815b18ea66d6 ("ftracetest: Add basic event tracing test cases") Cc: stable@vger.kernel.org Signed-off-by: Yuanhe Shu Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede6249866..b7c8f29c09a9 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi -- cgit v1.2.3 From 72d7cb5c190befbb095bae7737e71560ec0fcaa6 Mon Sep 17 00:00:00 2001 From: Shengyu Li Date: Wed, 27 Mar 2024 05:13:15 +0800 Subject: selftests/harness: Prevent infinite loop due to Assert in FIXTURE_TEARDOWN This patch addresses an issue in the selftests/harness where an assertion within FIXTURE_TEARDOWN could trigger an infinite loop. The problem arises because the teardown procedure is meant to execute once, but the presence of failing assertions (ASSERT_EQ(0, 1)) leads to repeated attempts to execute teardown due to the long jump mechanism used by the harness for handling assertions. To resolve this, the patch ensures that the teardown process runs only once, regardless of assertion outcomes, preventing the infinite loop and allowing tests to fail. A simple test demo(test.c): #include "kselftest_harness.h" FIXTURE(f) { int fd; }; FIXTURE_SETUP(f) { self->fd = 0; } FIXTURE_TEARDOWN(f) { TH_LOG("TEARDOWN"); ASSERT_EQ(0, 1); self->fd = -1; } TEST_F(f, open_close) { ASSERT_NE(self->fd, 1); } TEST_HARNESS_MAIN will always output the following output due to a dead loop until timeout: # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) ... But here's what we should and expect to get: TAP version 13 1..1 # Starting 1 tests from 2 test cases. # RUN f.open_close ... # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # open_close: Test terminated by assertion # FAIL f.open_close not ok 1 f.open_close # FAILED: 0 / 1 tests passed. 
# Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 also this is related to the issue mentioned in this patch https://patchwork.kernel.org/project/linux-kselftest/patch/e2ba3f8c-80e6-477d-9cea-1c9af820e0ed@alu.unizg.hr/ Signed-off-by: Shengyu Li Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..230d62884885 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -383,6 +383,7 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ + bool jmp = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,8 +400,10 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ + else \ + jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ -- cgit v1.2.3 From 001c5d19341a39cb683ab0a18ce4b662a09d96a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:15 -0700 Subject: cxl: Consolidate dport access_coordinate ->hb_coord and ->sw_coord into ->coord The driver stores access_coordinate for host bridge in ->hb_coord and switch CDAT access_coordinate in ->sw_coord. Since neither of these access_coordinate clobber each other, the variable name can be consolidated into ->coord to simplify the code. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/cdat.c | 74 ++++++++++++++++++++++++++++---------------- drivers/cxl/core/port.c | 48 +++++++++++++++++++++------- drivers/cxl/cxl.h | 6 ++-- drivers/cxl/cxlmem.h | 2 +- tools/testing/cxl/test/cxl.c | 10 +++--- 6 files changed, 94 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 566c387d4385..cb8c155a2c9b 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -529,7 +529,7 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - return acpi_get_genport_coordinates(uid, dport->hb_coord); + return acpi_get_genport_coordinates(uid, dport->coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index f66b493b5d3c..bb83867d9fec 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -14,7 +14,7 @@ struct dsmas_entry { struct range dpa_range; u8 handle; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int entries; int qos_class; @@ -88,8 +88,8 @@ static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, return 0; } -static void cxl_access_coordinate_set(struct access_coordinate *coord, - int access, unsigned int val) +static void __cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) { switch (access) { case ACPI_HMAT_ACCESS_LATENCY: @@ -115,6 +115,13 @@ static void cxl_access_coordinate_set(struct access_coordinate *coord, } } +static void 
cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_access_coordinate_set(&coord[i], access, val); +} + static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -156,7 +163,7 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), dslbis->data_type); - cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); + cxl_access_coordinate_set(dent->coord, dslbis->data_type, val); return 0; } @@ -190,13 +197,13 @@ static int cxl_cdat_endpoint_process(struct cxl_port *port, static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { - struct access_coordinate ep_c; + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; int rc; - rc = cxl_endpoint_get_perf_coordinates(port, &ep_c); + rc = cxl_endpoint_get_perf_coordinates(port, ep_c); if (rc) { dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n"); return rc; @@ -213,10 +220,11 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, xa_for_each(dsmas_xa, index, dent) { int qos_class; - cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); + cxl_coordinates_combine(dent->coord, dent->coord, ep_c); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, - &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, + &dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); if (rc != 1) continue; @@ -233,14 +241,17 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, struct cxl_dpa_perf *dpa_perf) { + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + dpa_perf->coord[i] = dent->coord[i]; dpa_perf->dpa_range = dent->dpa_range; - dpa_perf->coord = dent->coord; dpa_perf->qos_class = dent->qos_class; dev_dbg(dev, "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", dent->dpa_range.start, dpa_perf->qos_class, - dent->coord.read_bandwidth, dent->coord.write_bandwidth, - dent->coord.read_latency, dent->coord.write_latency); + dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].read_latency, + dent->coord[ACCESS_COORDINATE_CPU].write_latency); } static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, @@ -477,10 +488,11 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || - dsp_id == dport->port_id) - cxl_access_coordinate_set(&dport->sw_coord, + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, sslbis->data_type, val); + } } } @@ -502,6 +514,21 @@ void cxl_switch_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, CXL); +static void __cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c1->write_bandwidth && c2->write_bandwidth) + out->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + out->write_latency = c1->write_latency + c2->write_latency; + + if (c1->read_bandwidth && c2->read_bandwidth) + out->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + out->read_latency = c1->read_latency + 
c2->read_latency; +} + /** * cxl_coordinates_combine - Combine the two input coordinates * @@ -513,15 +540,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, struct access_coordinate *c1, struct access_coordinate *c2) { - if (c1->write_bandwidth && c2->write_bandwidth) - out->write_bandwidth = min(c1->write_bandwidth, - c2->write_bandwidth); - out->write_latency = c1->write_latency + c2->write_latency; - - if (c1->read_bandwidth && c2->read_bandwidth) - out->read_bandwidth = min(c1->read_bandwidth, - c2->read_bandwidth); - out->read_latency = c1->read_latency + c2->read_latency; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]); } MODULE_IMPORT_NS(CXL); @@ -558,12 +578,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - perf->coord.read_latency); + perf->coord[i].read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - perf->coord.write_latency); - cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; + perf->coord[i].write_latency); + cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c7c00eb373af..801c4018a1dd 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,6 +2133,29 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); +static void add_latency(struct access_coordinate *c, long latency) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_latency += latency; + c[i].read_latency += latency; + } +} + +static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_bandwidth = min(c[i].write_bandwidth, bw); + c[i].read_bandwidth = min(c[i].read_bandwidth, bw); + } +} + +static void set_access_coordinates(struct access_coordinate *out, + struct access_coordinate *in) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + out[i] = in[i]; +} + static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2149,9 +2172,15 @@ static bool parent_port_is_cxl_root(struct cxl_port *port) int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord) { - struct access_coordinate c = { - .read_bandwidth = UINT_MAX, - .write_bandwidth = UINT_MAX, + struct access_coordinate c[] = { + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, }; struct cxl_port *iter = port; struct cxl_dport *dport; @@ -2178,14 +2207,13 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * have CDAT and therefore needs to be skipped. 
*/ if (!is_cxl_root) - cxl_coordinates_combine(&c, &c, &dport->sw_coord); - c.write_latency += dport->link_latency; - c.read_latency += dport->link_latency; + cxl_coordinates_combine(c, c, dport->coord); + add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ - cxl_coordinates_combine(&c, &c, dport->hb_coord); + cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); @@ -2194,10 +2222,8 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, return -ENXIO; bw /= BITS_PER_BYTE; - c.write_bandwidth = min(c.write_bandwidth, bw); - c.read_bandwidth = min(c.read_bandwidth, bw); - - *coord = c; + set_min_bandwidth(c, bw); + set_access_coordinates(coord, c); return 0; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ed02373ce3d9..036d17db68e0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -663,8 +663,7 @@ struct cxl_rcrb_info { * @rch: Indicate whether this dport was enumerated in RCH or VH mode * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks - * @sw_coord: access coordinates (performance) for switch from CDAT - * @hb_coord: access coordinates (performance) from ACPI generic port (host bridge) + * @coord: access coordinates (bandwidth and latency performance attributes) * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { @@ -675,8 +674,7 @@ struct cxl_dport { bool rch; struct cxl_port *port; struct cxl_regs regs; - struct access_coordinate sw_coord; - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; long link_latency; }; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 20fb3b35e89e..36cee9c30ceb 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -401,7 +401,7 @@ enum cxl_devtype { */ struct cxl_dpa_perf { struct range dpa_range; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int qos_class; }; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 908e0d083936..61c69297e797 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, { dpa_perf->qos_class = FAKE_QTG_ID; dpa_perf->dpa_range = *range; - dpa_perf->coord.read_latency = 500; - dpa_perf->coord.write_latency = 500; - dpa_perf->coord.read_bandwidth = 1000; - dpa_perf->coord.write_bandwidth = 1000; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i].read_latency = 500; + dpa_perf->coord[i].write_latency = 500; + dpa_perf->coord[i].read_bandwidth = 1000; + dpa_perf->coord[i].write_bandwidth = 1000; + } } static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) -- cgit v1.2.3 From 08a828249b16f69248652937be7f1b8dab340a22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Mar 2024 17:36:41 -0800 Subject: KVM: selftests: Verify post-RESET value of PERF_GLOBAL_CTRL in PMCs test Add a guest assert in the PMU counters test to verify that KVM stuffs the vCPU's post-RESET value to globally enable all general purpose counters. Per Intel's SDM, IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. and Where "n" is the number of general-purpose counters available in the processor. 
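In other words, for n > 0 general-purpose counters the architectural post-RESET value is just the low n bits set. A worked example using the GENMASK_ULL() helper, as the test below does:

  /* e.g. a PMU with 8 GP counters */
  uint64_t expected = GENMASK_ULL(8 - 1, 0);   /* bits 7:0 set -> 0xff */
  /* the test asserts rdmsr(MSR_CORE_PERF_GLOBAL_CTRL) == expected */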
For the edge case where there are zero GP counters, follow the spirit of the architecture, not the SDM's literal wording, which doesn't account for this possibility and would require the CPU to set _all_ bits in PERF_GLOBAL_CTRL. Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Link: https://lore.kernel.org/r/20240309013641.1413400-3-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86_64/pmu_counters_test.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 29609b52f8fa..26c85815f7e9 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -416,12 +416,30 @@ static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters static void guest_test_gp_counters(void) { + uint8_t pmu_version = guest_get_pmu_version(); uint8_t nr_gp_counters = 0; uint32_t base_msr; - if (guest_get_pmu_version()) + if (pmu_version) nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. + */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + if (this_cpu_has(X86_FEATURE_PDCM) && rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) base_msr = MSR_IA32_PMC0; -- cgit v1.2.3 From 0ef2dd1f4144bc024d3c98954fa061a102f6481b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Fri, 15 Mar 2024 10:35:07 -0400 Subject: KVM: selftests: fix max_guest_memory_test with more that 256 vCPUs max_guest_memory_test uses ucalls to sync with the host, but it also resets the guest RIP back to its initial value in between tests stages. This makes the guest never reach the code which frees the ucall struct and since a fixed pool of 512 ucall structs is used, the test starts to fail when more that 256 vCPUs are used. Fix that by replacing the manual register reset with a loop in the guest code. 
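The failure mode, sketched (simplified; ucall_alloc()/ucall_free() are internal helpers of the selftests' ucall() implementation, and the 512-entry pool size is the one mentioned above):

  struct ucall *uc = ucall_alloc();   /* claims one of the 512 pool entries */
  /* ... fill uc and exit to the host (GUEST_SYNC / GUEST_DONE) ... */
  ucall_free(uc);                     /* never reached when the host rewinds RIP
                                       * to the start of guest_code(), so every
                                       * test stage leaks one entry per vCPU */

With the guest-side loop below, the vCPU resumes after each GUEST_SYNC() instead of being rewound, so the slot is always returned and the pool no longer runs dry.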
Signed-off-by: Maxim Levitsky Link: https://lore.kernel.org/r/20240315143507.102629-1-mlevitsk@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/max_guest_memory_test.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 6628dc4dda89..1a6da7389bf1 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; - for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; - - GUEST_DONE(); + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } } struct vcpu_info { @@ -55,7 +56,7 @@ static void rendezvous_with_boss(void) static void run_vcpu(struct kvm_vcpu *vcpu) { vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); } static void *vcpu_worker(void *data) @@ -64,17 +65,13 @@ static void *vcpu_worker(void *data) struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; - struct kvm_regs regs; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); - /* Snapshot regs before the first run. */ - vcpu_regs_get(vcpu, ®s); rendezvous_with_boss(); run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_regs_set(vcpu, ®s); vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ /* Toggle CR0.WP to trigger a MMU context reset. */ -- cgit v1.2.3 From 449c0811d8729068646e1a270be72bf2da2e66f3 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 3 Apr 2024 14:33:01 +0200 Subject: KVM: selftests: fix supported_flags for riscv commit 849c1816436f ("KVM: selftests: fix supported_flags for aarch64") fixed the set-memory-region test for aarch64 by declaring the read-only flag is supported. riscv also supports the read-only flag. Fix it too. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240403123300.63923-2-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 06b43ed23580..bd57d991e27d 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void) struct kvm_vm *vm; int r, i; -#if defined __aarch64__ || defined __x86_64__ +#if defined __aarch64__ || defined __riscv || defined __x86_64__ supported_flags |= KVM_MEM_READONLY; #endif -- cgit v1.2.3 From 6d029c25b71f2de2838a6f093ce0fa0e69336154 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 9 Apr 2024 15:38:03 +0200 Subject: selftests/timers/posix_timers: Reimplement check_timer_distribution() check_timer_distribution() runs ten threads in a busy loop and tries to test that the kernel distributes a process posix CPU timer signal to every thread over time. 
There is not guarantee that this is true even after commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") because that commit only avoids waking up the sleeping process leader thread, but that has nothing to do with the actual signal delivery. As the signal is process wide the first thread which observes sigpending and wins the race to lock sighand will deliver the signal. Testing shows that this hangs on a regular base because some threads never win the race. The comment "This primarily tests that the kernel does not favour any one." is wrong. The kernel does favour a thread which hits the timer interrupt when CLOCK_PROCESS_CPUTIME_ID expires. Rewrite the test so it only checks that the group leader sleeping in join() never receives SIGALRM and the thread which burns CPU cycles receives all signals. In older kernels which do not have commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") the test-case fails immediately, the very 1st tick wakes the leader up. Otherwise it quickly succeeds after 100 ticks. CI testing wants to use newer selftest versions on stable kernels. In this case the test is guaranteed to fail. So check in the failure case whether the kernel version is less than v6.3 and skip the test result in that case. [ tglx: Massaged change log, renamed the version check helper ] Fixes: e797203fb3ba ("selftests/timers/posix_timers: Test delivery of signals across threads") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240409133802.GD29396@redhat.com --- tools/testing/selftests/kselftest.h | 13 ++++ tools/testing/selftests/timers/posix_timers.c | 103 ++++++++++++-------------- 2 files changed, 60 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..973b18e156b2 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -51,6 +51,7 @@ #include #include #include +#include #endif #ifndef ARRAY_SIZE @@ -388,4 +389,16 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) 
exit(KSFT_SKIP); } +static inline int ksft_min_kernel_version(unsigned int min_major, + unsigned int min_minor) +{ + unsigned int major, minor; + struct utsname info; + + if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) + ksft_exit_fail_msg("Can't parse kernel version\n"); + + return major > min_major || (major == min_major && minor >= min_minor); +} + #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d49dd3ffd0d9..d86a0e00711e 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -184,80 +184,71 @@ static int check_timer_create(int which) return 0; } -int remain; -__thread int got_signal; +static pthread_t ctd_thread; +static volatile int ctd_count, ctd_failed; -static void *distribution_thread(void *arg) +static void ctd_sighandler(int sig) { - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); - return NULL; + if (pthread_self() != ctd_thread) + ctd_failed = 1; + ctd_count--; } -static void distribution_handler(int nr) +static void *ctd_thread_func(void *arg) { - if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) - __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); -} - -/* - * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID - * timer signals. This primarily tests that the kernel does not favour any one. - */ -static int check_timer_distribution(void) -{ - int err, i; - timer_t id; - const int nthreads = 10; - pthread_t threads[nthreads]; struct itimerspec val = { .it_value.tv_sec = 0, .it_value.tv_nsec = 1000 * 1000, .it_interval.tv_sec = 0, .it_interval.tv_nsec = 1000 * 1000, }; + timer_t id; - remain = nthreads + 1; /* worker threads + this thread */ - signal(SIGALRM, distribution_handler); - err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + /* 1/10 seconds to ensure the leader sleeps */ + usleep(10000); - for (i = 0; i < nthreads; i++) { - err = pthread_create(&threads[i], NULL, distribution_thread, - NULL); - if (err) { - ksft_print_msg("Can't create thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + ctd_count = 100; + if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) + return "Can't create timer\n"; + if (timer_settime(id, 0, &val, NULL)) + return "Can't set timer\n"; - /* Wait for all threads to receive the signal. */ - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + while (ctd_count > 0 && !ctd_failed) + ; - for (i = 0; i < nthreads; i++) { - err = pthread_join(threads[i], NULL); - if (err) { - ksft_print_msg("Can't join thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + if (timer_delete(id)) + return "Can't delete timer\n"; - if (timer_delete(id)) { - ksft_perror("Can't delete timer"); - return -1; - } + return NULL; +} + +/* + * Test that only the running thread receives the timer signal. 
+ */ +static int check_timer_distribution(void) +{ + const char *errmsg; - ksft_test_result_pass("check_timer_distribution\n"); + signal(SIGALRM, ctd_sighandler); + + errmsg = "Can't create thread\n"; + if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) + goto err; + + errmsg = "Can't join thread\n"; + if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) + goto err; + + if (!ctd_failed) + ksft_test_result_pass("check signal distribution\n"); + else if (ksft_min_kernel_version(6, 3)) + ksft_test_result_fail("check signal distribution\n"); + else + ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; +err: + ksft_print_msg(errmsg); + return -1; } int main(int argc, char **argv) -- cgit v1.2.3 From 05a2f07db8883b027c0b4a475fcc586278922b8d Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 5 Mar 2024 12:27:27 +0100 Subject: tools/power turbostat: read RAPL counters via perf Some of the future Intel platforms will require reading the RAPL counters via perf and not MSR. On current platforms we can still read them using both ways. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 786 ++++++++++++++++++++++++++++------ 1 file changed, 649 insertions(+), 137 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a380829c5890..283dffb987b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -37,6 +37,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -60,6 +61,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; +enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -958,6 +960,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 +/* Indexes used to map data read from perf and MSRs into global variables */ +enum rapl_rci_index { + RAPL_RCI_INDEX_ENERGY_PKG = 0, + RAPL_RCI_INDEX_ENERGY_CORES = 1, + RAPL_RCI_INDEX_DRAM = 2, + RAPL_RCI_INDEX_GFX = 3, + RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, + RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, + RAPL_RCI_INDEX_CORE_ENERGY = 6, + NUM_RAPL_COUNTERS, +}; + +enum rapl_unit { + RAPL_UNIT_INVALID, + RAPL_UNIT_JOULES, + RAPL_UNIT_WATTS, +}; + +struct rapl_counter_info_t { + unsigned long long data[NUM_RAPL_COUNTERS]; + enum rapl_source source[NUM_RAPL_COUNTERS]; + unsigned long long flags[NUM_RAPL_COUNTERS]; + double scale[NUM_RAPL_COUNTERS]; + enum rapl_unit unit[NUM_RAPL_COUNTERS]; + + union { + /* Active when source == RAPL_SOURCE_MSR */ + struct { + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; + }; + }; + + int fd_perf; +}; + +/* struct rapl_counter_info_t for each RAPL domain */ +struct rapl_counter_info_t *rapl_counter_info_perdomain; + +#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) + +struct rapl_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + int msr_shift; /* Positive mean shift right, negative mean shift left 
*/ + double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ + unsigned long long flags; +}; + +static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic = BIC_RAMWatt | BIC_RAM_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic = BIC_GFXWatt | BIC_GFX_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_PKG_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, + .bic = BIC_PKG__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_DRAM_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, + .bic = BIC_RAM__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = 0, + }, +}; + +struct rapl_counter { + unsigned long long raw_value; + enum rapl_unit unit; + double scale; +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -984,7 +1155,7 @@ struct core_data { 
unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; - unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ + struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1009,12 +1180,12 @@ struct pkg_data { unsigned int gfx_mhz; unsigned int gfx_act_mhz; unsigned int package_id; - unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ + struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + struct rapl_counter rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1403,6 +1574,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +int probe_msr(int cpu, off_t offset) +{ + ssize_t retval; + unsigned long long dummy; + + assert(!no_msr); + + retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + + if (retval != sizeof(dummy)) + return 1; + + return 0; +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1755,7 +1941,11 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "c7: %016llX\n", c->c7); outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c); outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt); - outp += sprintf(outp, "Joules: %0X\n", c->core_energy); + + const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; + const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) + outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += @@ -1785,12 +1975,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); + outp += sprintf(outp, "Throttle 
RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { @@ -1805,6 +1995,23 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } +double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval) +{ + assert(desired_unit != RAPL_UNIT_INVALID); + + /* + * For now we don't expect anything other than joules, + * so just simplify the logic. + */ + assert(c->unit == RAPL_UNIT_JOULES); + + const double scaled = c->raw_value * c->scale; + + if (desired_unit == RAPL_UNIT_WATTS) + return scaled / interval; + return scaled; +} + /* * column formatting convention & formats */ @@ -1998,9 +2205,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) @@ -2072,34 +2281,40 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_PkgWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - p->energy_dram * rapl_dram_energy_units / interval_float); + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), p->energy_dram * rapl_dram_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); @@ -2208,12 +2423,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; - old->energy_pkg = new->energy_pkg - old->energy_pkg; - old->energy_cores = new->energy_cores - old->energy_cores; - old->energy_gfx = new->energy_gfx - old->energy_gfx; - old->energy_dram = new->energy_dram - old->energy_dram; - old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; - old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; + old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; + old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; + old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; + old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = + new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2237,7 +2453,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->core_throt_cnt = new->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; - DELTA_WRAP32(new->core_energy, old->core_energy); + DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2364,6 +2580,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void rapl_counter_clear(struct rapl_counter *c) +{ + c->raw_value = 0; + c->scale = 0.0; + c->unit = RAPL_UNIT_INVALID; +} + void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2391,7 +2614,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; - c->core_energy = 0; + rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; p->pkg_wtd_core_c0 = 0; @@ -2412,12 +2635,12 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->cpu_lpi = 0; p->sys_lpi = 0; - p->energy_pkg = 0; - p->energy_dram = 0; - p->energy_cores = 0; - p->energy_gfx = 0; - p->rapl_pkg_perf_status = 0; - p->rapl_dram_perf_status = 0; + rapl_counter_clear(&p->energy_pkg); + rapl_counter_clear(&p->energy_dram); + rapl_counter_clear(&p->energy_cores); + rapl_counter_clear(&p->energy_gfx); + rapl_counter_clear(&p->rapl_pkg_perf_status); + rapl_counter_clear(&p->rapl_dram_perf_status); p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; @@ -2434,6 +2657,20 @@ void 
clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->counter[i] = 0; } +void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) +{ + /* Copy unit and scale from src if dst is not initialized */ + if (dst->unit == RAPL_UNIT_INVALID) { + dst->unit = src->unit; + dst->scale = src->scale; + } + + assert(dst->unit == src->unit); + assert(dst->scale == src->scale); + + dst->raw_value += src->raw_value; +} + int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2480,7 +2717,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); - average.cores.core_energy += c->core_energy; + rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2515,10 +2752,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; - average.packages.energy_pkg += p->energy_pkg; - average.packages.energy_dram += p->energy_dram; - average.packages.energy_cores += p->energy_cores; - average.packages.energy_gfx += p->energy_gfx; + rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.uncore_mhz = p->uncore_mhz; @@ -2527,8 +2764,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); - average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; - average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; + rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) @@ -2797,25 +3034,53 @@ struct amperf_group_fd { int mperf; }; -static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) { int fdmt; - char buf[16]; - unsigned int v; + int bytes_read; + char buf[64]; + int ret = -1; fdmt = open(path, O_RDONLY, 0); - if (fdmt == -1) - errx(1, "Failed to read perf counter info %s\n", path); + if (fdmt == -1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - if (read(fdmt, buf, sizeof(buf)) <= 0) - return 0; + bytes_read = read(fdmt, buf, sizeof(buf) - 1); + if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - buf[sizeof(buf) - 1] = '\0'; + buf[bytes_read] = '\0'; - if (sscanf(buf, parse_format, &v) != 1) - errx(1, "Failed to parse perf counter info %s\n", path); + if (sscanf(buf, parse_format, value_ptr) != 1) { + if (debug) + fprintf(stderr, "Failed to 
parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + ret = 0; + +cleanup_and_exit: close(fdmt); + return ret; +} + +static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format) +{ + unsigned int v; + int status; + + status = read_perf_counter_info(path, parse_format, &v); + if (status) + v = -1; return v; } @@ -2825,7 +3090,7 @@ static unsigned read_msr_type(void) const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_aperf_config(void) @@ -2833,7 +3098,7 @@ static unsigned read_aperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_mperf_config(void) @@ -2841,7 +3106,60 @@ static unsigned read_mperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_type(const char *subsys) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/type"; + const char *const format = "%u"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_rapl_config(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; + const char *const format = "event=%x"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; + const char *const format = "%s"; + char path[128]; + char unit_buffer[16]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + read_perf_counter_info(path, format, &unit_buffer); + if (strcmp("Joules", unit_buffer) == 0) + return RAPL_UNIT_JOULES; + + return RAPL_UNIT_INVALID; +} + +static double read_perf_rapl_scale(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; + const char *const format = "%lf"; + char path[128]; + double scale; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + if (read_perf_counter_info(path, format, &scale)) + return 0.0; + + return scale; } static struct amperf_group_fd open_amperf_fd(int cpu) @@ -2961,6 +3279,99 @@ retry: return 0; } +size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) + if (rci->source[i] == RAPL_SOURCE_PERF) + ++ret; + + return ret; +} + +void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) +{ + rc->raw_value = rci->data[idx]; + rc->unit = rci->unit[idx]; + rc->scale = rci->scale[idx]; +} + +int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +{ + unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + + if (debug) + fprintf(stderr, "get_rapl_counters: 
cpu%d domain%d\n", cpu, domain); + + assert(rapl_counter_info_perdomain); + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (rci->fd_perf != -1) { + size_t num_perf_counters = rapl_counter_info_count_perf(rci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) + err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { + switch (rci->source[i]) { + case RAPL_SOURCE_NONE: + break; + + case RAPL_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(rci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", + i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); + + rci->data[i] = perf_data[pi]; + + ++pi; + break; + + case RAPL_SOURCE_MSR: + if (debug) + fprintf(stderr, "Reading rapl counter via msr at %u\n", i); + + assert(!no_msr); + if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) { + if (get_msr_sum(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } else { + if (get_msr(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } + + rci->data[i] &= rci->msr_mask[i]; + if (rci->msr_shift[i] >= 0) + rci->data[i] >>= abs(rci->msr_shift[i]); + else + rci->data[i] <<= abs(rci->msr_shift[i]); + + break; + } + } + + _Static_assert(NUM_RAPL_COUNTERS == 7); + write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); + write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); + write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); + write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX); + write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); + write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); + write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + + return 0; +} + /* * get_counters(...) 
* migrate to cpu @@ -2972,6 +3383,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) unsigned long long msr; struct msr_counter *mp; int i; + int status; if (cpu_migrate(cpu)) { fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); @@ -3029,6 +3441,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (!is_cpu_first_thread_in_core(t, c, p)) goto done; + if (platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, c->core_id, c, p); + if (status != 0) + return status; + } + if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; @@ -3069,12 +3487,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { - if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) - return -14; - c->core_energy = msr & 0xFFFFFFFF; - } - for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -3134,42 +3546,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (!no_msr) { - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; - } + if (!platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, p->package_id, c, p); + if (status != 0) + return status; } if (DO_BIC(BIC_PkgTmp)) { @@ -3714,6 +4094,21 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_rapl_percpu(void) +{ + if (!rapl_counter_info_perdomain) + return; + + const int num_domains = platform->has_per_core_rapl ? 
topo.num_cores : topo.num_packages; + + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) + close(rapl_counter_info_perdomain[domain_id].fd_perf); + } + + free(rapl_counter_info_perdomain); +} + void free_all_buffers(void) { int i; @@ -3757,6 +4152,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); free_fd_amperf_percpu(); + free_fd_rapl_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4091,12 +4487,14 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void rapl_perf_init(void); void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + rapl_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5315,31 +5713,18 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_Pkg_J); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_Cor_J); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAM_J); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFX_J); - } else { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_PkgWatt); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_CorWatt); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAMWatt); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFXWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) - BIC_PRESENT(BIC_PKG__); - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) - BIC_PRESENT(BIC_RAM__); + if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + bic_enabled &= ~BIC_PKG__; + if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + bic_enabled &= ~BIC_RAM__; /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -5373,14 +5758,13 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; @@ -5885,6 +6269,48 @@ bool is_aperf_access_required(void) || BIC_IS_ENABLED(BIC_IPC); } +int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale_, enum rapl_unit *unit_) +{ + if (no_perf) + return -1; + + const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) + return -1; + + const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) + return -1; + + const unsigned rapl_type = read_perf_type(cai->perf_subsys); + const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + 
const int fd_counter = + open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (rci->fd_perf == -1) + rci->fd_perf = fd_counter; + + *scale_ = scale; + *unit_ = unit; + return fd_counter; +} + +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale, enum rapl_unit *unit) +{ + int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); + + if (debug) + fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + + return ret; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5908,17 +6334,101 @@ void linux_perf_init(void) } } -static int has_amperf_access_via_msr(void) +void rapl_perf_init(void) { - unsigned long long dummy; + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + bool *domain_visited = calloc(num_domains, sizeof(bool)); + + rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); + if (rapl_counter_info_perdomain == NULL) + err(-1, "calloc rapl_counter_info_percpu"); + /* + * Initialize rapl_counter_info_percpu + */ + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; + for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { + rci->data[i] = 0; + rci->source[i] = RAPL_SOURCE_NONE; + } + } + + /* + * Open/probe the counters + * If can't get it via perf, fallback to MSR + */ + for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) { + + const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i]; + bool has_counter = 0; + double scale; + enum rapl_unit unit; + int next_domain; + + memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + /* Skip already seen and handled RAPL domains */ + next_domain = + platform->has_per_core_rapl ? 
cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + + if (domain_visited[next_domain]) + continue; + + domain_visited[next_domain] = 1; + + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; + + /* Check if the counter is enabled and accessible */ + if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) { + + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { + rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->scale[cai->rci_index] = scale * cai->compat_scale; + rci->unit[cai->rci_index] = unit; + rci->flags[cai->rci_index] = cai->flags; + + /* Use MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->msr[cai->rci_index] = cai->msr; + rci->msr_mask[cai->rci_index] = cai->msr_mask; + rci->msr_shift[cai->rci_index] = cai->msr_shift; + rci->unit[cai->rci_index] = RAPL_UNIT_JOULES; + rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale; + rci->flags[cai->rci_index] = cai->flags; + } + } + + if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + has_counter = 1; + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(domain_visited); +} + +static int has_amperf_access_via_msr(void) +{ if (no_msr) return 0; - if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_APERF)) return 0; - if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_MPERF)) return 0; return 1; @@ -6696,6 +7206,7 @@ bool is_msr_access_required(void) || BIC_IS_ENABLED(BIC_Pkgpc8) || BIC_IS_ENABLED(BIC_Pkgpc9) || BIC_IS_ENABLED(BIC_Pkgpc10) + /* TODO: Multiplex access with perf */ || BIC_IS_ENABLED(BIC_CorWatt) || BIC_IS_ENABLED(BIC_Cor_J) || BIC_IS_ENABLED(BIC_PkgWatt) @@ -6749,6 +7260,7 @@ void turbostat_init() probe_pm_features(); set_amperf_source(); linux_perf_init(); + rapl_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); -- cgit v1.2.3 From 17d1ea136be86f53be0461b0c33daf6b58e6cbf7 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 7 Mar 2024 17:00:35 +0100 Subject: tools/power turbostat: Add selftests Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/testing/selftests/turbostat/defcolumns.py | 59 +++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 tools/testing/selftests/turbostat/defcolumns.py (limited to 'tools') diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py new file mode 100755 index 000000000000..70d3b7780311 --- /dev/null +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -0,0 +1,59 @@ +#!/bin/env python3 + +import subprocess +from shutil import which + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +# +# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns +# which are only visible when running with --debug. 
+# +expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip() +expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'') + +# +# Run turbostat with no options for 10 seconds and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s'] +turbostat_argv = [turbostat, '-i', '0.250'] + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) +print('OK') + +# +# Same, but with --debug +# +turbostat_argv.append('--debug') + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) +print('OK') -- cgit v1.2.3 From bb5db22c13125b38b0740e19c18ae94f8e5a0eb6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 11:19:15 +0800 Subject: tools/power/turbostat: Enable MSR_CORE_C1_RES support for ICX Enable Core C1 hardware residency counter (MSR_CORE_C1_RES) on ICX. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 283dffb987b5..372f67a70d8a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -664,6 +664,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, -- cgit v1.2.3 From 4e2bbbf78cf7144204214fd0bd7cca309acd8f89 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 14:23:37 +0800 Subject: tools/power/turbostat: Cache graphics sysfs path Graphics drivers (i915/Xe) have different sysfs knobs on different platforms, and it is possible that different sysfs knobs fit into the same turbostat columns. Instead of specifying different sysfs knobs every time, detect them once and cache the path for future use. No functional change. 
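The probe-once/cache-forever idea boils down to the self-contained sketch below; the helper and variable names (probe_gfx_freq_path, cached_freq_path) are illustrative stand-ins rather than actual turbostat symbols, and only a single knob with one fallback path is shown:

/*
 * Illustrative sketch, not turbostat code: probe candidate sysfs paths
 * once at startup, cache whichever is readable, and reuse the cached
 * path on every snapshot instead of re-trying fallbacks each interval.
 */
#include <stdio.h>
#include <unistd.h>

static const char *cached_freq_path;	/* set once by the probe */

static void probe_gfx_freq_path(void)
{
	if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK))
		cached_freq_path = "/sys/class/drm/card0/gt_cur_freq_mhz";
	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
		cached_freq_path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz";
}

static int snapshot_gfx_freq(unsigned int *mhz)
{
	FILE *fp;
	int ret = -1;

	if (!cached_freq_path)
		return ret;

	fp = fopen(cached_freq_path, "r");
	if (!fp)
		return ret;
	if (fscanf(fp, "%u", mhz) == 1)
		ret = 0;
	fclose(fp);
	return ret;
}

int main(void)
{
	unsigned int mhz;

	probe_gfx_freq_path();
	if (!snapshot_gfx_freq(&mhz))
		printf("GFXMHz: %u\n", mhz);
	return 0;
}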
Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 45 +++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 372f67a70d8a..78db4f65a237 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -276,6 +276,19 @@ bool no_msr; bool no_perf; enum amperf_source amperf_source; +enum gfx_sysfs_idx { + GFX_rc6, + GFX_MHz, + GFX_ACTMHz, + GFX_MAX +}; + +struct gfx_sysfs_info { + const char *path; +}; + +static struct gfx_sysfs_info gfx_info[GFX_MAX]; + int get_msr(int cpu, off_t offset, unsigned long long *msr); /* Model specific support Start */ @@ -4620,7 +4633,7 @@ int snapshot_gfx_rc6_ms(void) FILE *fp; int retval; - fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); + fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); if (retval != 1) @@ -4645,9 +4658,7 @@ int snapshot_gfx_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -4674,9 +4685,7 @@ int snapshot_gfx_act_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -5338,14 +5347,24 @@ probe_cluster: static void probe_graphics(void) { if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; + + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (gfx_info[GFX_rc6].path) + BIC_PRESENT(BIC_GFX_rc6); + if (gfx_info[GFX_MHz].path) + BIC_PRESENT(BIC_GFXMHz); + if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); } -- cgit v1.2.3 From de39d38c06eb047954c5ad20a3f9acb6d3c78498 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:12:19 +0800 Subject: tools/power/turbostat: Unify graphics sysfs snapshots Graphics sysfs snapshots share similar logic. Combine them into one function to avoid code duplication. No functional change. 
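The shape of the unified helper, as a stand-alone sketch; the enum, struct and function names below are simplified stand-ins rather than the exact turbostat symbols, and only two knob kinds are shown:

/*
 * Illustrative sketch, not turbostat code: one parameterized snapshot
 * routine replacing several near-identical per-counter functions.
 */
#include <stdio.h>

enum gfx_idx { IDX_RC6, IDX_MHZ, IDX_MAX };

struct gfx_knob {
	const char *path;
	FILE *fp;			/* kept open for frequently polled knobs */
	unsigned long long val;
};

static struct gfx_knob knobs[IDX_MAX] = {
	[IDX_RC6] = { .path = "/sys/class/drm/card0/power/rc6_residency_ms" },
	[IDX_MHZ] = { .path = "/sys/class/drm/card0/gt_cur_freq_mhz" },
};

static int snapshot_gfx(enum gfx_idx idx)
{
	struct gfx_knob *k = &knobs[idx];
	FILE *fp;

	switch (idx) {
	case IDX_RC6:			/* one-shot: open and close every interval */
		fp = fopen(k->path, "r");
		if (!fp)
			return -1;
		if (fscanf(fp, "%llu", &k->val) != 1) {
			fclose(fp);
			return -1;
		}
		fclose(fp);
		return 0;
	case IDX_MHZ:			/* polled often: keep the stream open, rewind */
		if (!k->fp)
			k->fp = fopen(k->path, "r");
		else
			rewind(k->fp);
		if (!k->fp || fscanf(k->fp, "%llu", &k->val) != 1)
			return -1;
		return 0;
	default:
		return -1;
	}
}

int main(void)
{
	if (!snapshot_gfx(IDX_MHZ))
		printf("GFXMHz: %llu\n", knobs[IDX_MHZ].val);
	return 0;
}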
Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 109 +++++++++++----------------------- 1 file changed, 34 insertions(+), 75 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 78db4f65a237..e5b6161fef48 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -251,11 +251,8 @@ char *output_buffer, *outp; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; -unsigned int gfx_cur_mhz; -unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; @@ -285,6 +282,9 @@ enum gfx_sysfs_idx { struct gfx_sysfs_info { const char *path; + FILE *fp; + unsigned int val; + unsigned long long val_ull; }; static struct gfx_sysfs_info gfx_info[GFX_MAX]; @@ -3573,17 +3573,17 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_cur_rc6_ms; + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); if (DO_BIC(BIC_GFXMHz)) - p->gfx_mhz = gfx_cur_mhz; + p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) - p->gfx_act_mhz = gfx_act_mhz; + p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) @@ -4621,81 +4621,40 @@ int snapshot_proc_interrupts(void) } /* - * snapshot_gfx_rc6_ms() + * snapshot_graphics() * - * record snapshot of - * /sys/class/drm/card0/power/rc6_residency_ms + * record snapshot of specified graphics sysfs knob * * return 1 if config change requires a restart, else return 0 */ -int snapshot_gfx_rc6_ms(void) +int snapshot_graphics(int idx) { FILE *fp; int retval; - fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); - - retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); - if (retval != 1) - err(1, "GFX rc6"); - - fclose(fp); - - return 0; -} - -/* - * snapshot_gfx_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz - * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); - } else { - rewind(fp); - fflush(fp); - } - - retval = fscanf(fp, "%d", &gfx_cur_mhz); - if (retval != 1) - err(1, "GFX MHz"); - - return 0; -} - -/* - * snapshot_gfx_cur_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz - * when /sys/class/drm/card0/gt_act_freq_mhz is not available. 
- * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_act_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); - } else { - rewind(fp); - fflush(fp); + switch (idx) { + case GFX_rc6: + fp = fopen_or_die(gfx_info[idx].path, "r"); + retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + if (retval != 1) + err(1, "rc6"); + fclose(fp); + return 0; + case GFX_MHz: + case GFX_ACTMHz: + if (gfx_info[idx].fp == NULL) { + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + } else { + rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); + } + retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); + if (retval != 1) + err(1, "MHz"); + return 0; + default: + return -EINVAL; } - - retval = fscanf(fp, "%d", &gfx_act_mhz); - if (retval != 1) - err(1, "GFX ACT MHz"); - - return 0; } /* @@ -4760,13 +4719,13 @@ int snapshot_proc_sysfs_files(void) return 1; if (DO_BIC(BIC_GFX_rc6)) - snapshot_gfx_rc6_ms(); + snapshot_graphics(GFX_rc6); if (DO_BIC(BIC_GFXMHz)) - snapshot_gfx_mhz(); + snapshot_graphics(GFX_MHz); if (DO_BIC(BIC_GFXACTMHz)) - snapshot_gfx_act_mhz(); + snapshot_graphics(GFX_ACTMHz); if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); -- cgit v1.2.3 From 60add818ab2543b7e4f2bfeaacf2504743c1eb50 Mon Sep 17 00:00:00 2001 From: Justin Ernst Date: Tue, 2 Apr 2024 13:40:29 -0400 Subject: tools/power/turbostat: Fix uncore frequency file string Running turbostat on a 16 socket HPE Scale-up Compute 3200 (SapphireRapids) fails with: turbostat: /sys/devices/system/cpu/intel_uncore_frequency/package_010_die_00/current_freq_khz: open failed: No such file or directory We observe the sysfs uncore frequency directories named: ... package_09_die_00/ package_10_die_00/ package_11_die_00/ ... package_15_die_00/ The culprit is an incorrect sprintf format string "package_0%d_die_0%d" used with each instance of reading uncore frequency files. uncore-frequency-common.c creates the sysfs directory with the format "package_%02d_die_%02d". Once the package value reaches double digits, the formats diverge. Change each instance of "package_0%d_die_0%d" to "package_%02d_die_%02d". [lenb: deleted the probe part of this patch, as it was already fixed] Signed-off-by: Justin Ernst Reviewed-by: Thomas Renninger Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5b6161fef48..016a5c7dc9bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2939,7 +2939,7 @@ unsigned long long get_uncore_mhz(int package, int die) { char path[128]; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); return (snapshot_sysfs_counter(path) / 1000); -- cgit v1.2.3 From 3bbb331c1d34fdd5520a050fce35f71579430485 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:30:04 +0800 Subject: tools/power/turbostat: Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz Graphics driver (i915/Xe) on mordern platforms splits GFX and SA Media information via different sysfs knobs. Existing BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz columns can be reused for GFX. Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz columns for SA Media. 
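These columns plug into the existing 64-bit column bitmask ("BIC" bits). A minimal, self-contained illustration of that selection pattern follows; the macro names and bit positions are invented for the sketch and are not the real BIC_* values:

/*
 * Illustrative sketch, not turbostat code: a 64-bit column bitmask gates
 * both header and value output; a new counter only needs a new bit plus
 * a probe that sets it when the matching sysfs knob exists. Bit values
 * below are invented for the sketch.
 */
#include <stdio.h>

#define COL_GFX_RC6	(1ULL << 0)
#define COL_SAM_MC6	(1ULL << 1)
#define COL_SAM_MHZ	(1ULL << 2)

static unsigned long long cols_present;		/* set by probing sysfs */
static unsigned long long cols_enabled = ~0ULL;	/* trimmed by --show/--hide */

#define DO_COL(c)	((c) & cols_present & cols_enabled)

int main(void)
{
	/* pretend the probe found the SA Media knobs but not GFX rc6 */
	cols_present |= COL_SAM_MC6 | COL_SAM_MHZ;

	if (DO_COL(COL_GFX_RC6))
		printf("GFX%%rc6\t");
	if (DO_COL(COL_SAM_MC6))
		printf("SAM%%mc6\t");
	if (DO_COL(COL_SAM_MHZ))
		printf("SAMMHz\t");
	printf("\n");
	return 0;
}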
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.8 | 12 ++++- tools/power/x86/turbostat/turbostat.c | 91 +++++++++++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 567327b004e6..0d3672e5d9ed 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -129,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP -\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP -\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. +.PP +\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. +.PP +\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. 
.PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 016a5c7dc9bf..4fa2810da1a3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -133,6 +133,9 @@ struct msr_counter bic[] = { { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -191,11 +194,14 @@ struct msr_counter bic[] = { #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) #define BIC_UNCORE_MHZ (1ULL << 54) +#define BIC_SAM_mc6 (1ULL << 55) +#define BIC_SAMMHz (1ULL << 56) +#define BIC_SAMACTMHz (1ULL << 57) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -277,6 +283,9 @@ enum gfx_sysfs_idx { GFX_rc6, GFX_MHz, GFX_ACTMHz, + SAM_mc6, + SAM_MHz, + SAM_ACTMHz, GFX_MAX }; @@ -1193,6 +1202,9 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int gfx_act_mhz; + long long sam_mc6_ms; + unsigned int sam_mhz; + unsigned int sam_act_mhz; unsigned int package_id; struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ @@ -1844,6 +1856,15 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_SAM_mc6)) + outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -2251,6 +2272,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* SAMmc6 */ + if (DO_BIC(BIC_SAM_mc6)) { + if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ + outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); + } else { + outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), + p->sam_mc6_ms / 10.0 / interval_float); + } + } + + /* SAMMHz */ + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz); + + /* SAMACTMHz */ + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); @@ -2437,6 +2476,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; + /* flag an error when mc6 counter resets/wraps */ + if (old->sam_mc6_ms > new->sam_mc6_ms) + old->sam_mc6_ms = -1; + else + old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms; + + old->sam_mhz = new->sam_mhz; + old->sam_act_mhz = new->sam_act_mhz; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; @@ -2661,6 +2709,9 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; + p->sam_mc6_ms = 0; + p->sam_mhz = 0; + p->sam_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -2775,6 +2826,9 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; + average.packages.sam_mc6_ms = p->sam_mc6_ms; + average.packages.sam_mhz = p->sam_mhz; + average.packages.sam_act_mhz = p->sam_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -3572,19 +3626,28 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; - /* n.b. 
assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFX_rc6)) + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; + if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; + if (DO_BIC(BIC_SAM_mc6)) + p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull; + + if (DO_BIC(BIC_SAMMHz)) + p->sam_mhz = gfx_info[SAM_MHz].val; + + if (DO_BIC(BIC_SAMACTMHz)) + p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; + for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) return -10; @@ -4634,6 +4697,7 @@ int snapshot_graphics(int idx) switch (idx) { case GFX_rc6: + case SAM_mc6: fp = fopen_or_die(gfx_info[idx].path, "r"); retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); if (retval != 1) @@ -4642,6 +4706,8 @@ int snapshot_graphics(int idx) return 0; case GFX_MHz: case GFX_ACTMHz: + case SAM_MHz: + case SAM_ACTMHz: if (gfx_info[idx].fp == NULL) { gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); } else { @@ -4727,6 +4793,15 @@ int snapshot_proc_sysfs_files(void) if (DO_BIC(BIC_GFXACTMHz)) snapshot_graphics(GFX_ACTMHz); + if (DO_BIC(BIC_SAM_mc6)) + snapshot_graphics(SAM_mc6); + + if (DO_BIC(BIC_SAMMHz)) + snapshot_graphics(SAM_MHz); + + if (DO_BIC(BIC_SAMACTMHz)) + snapshot_graphics(SAM_ACTMHz); + if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -5325,6 +5400,12 @@ static void probe_graphics(void) BIC_PRESENT(BIC_GFXMHz); if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); + if (gfx_info[SAM_mc6].path) + BIC_PRESENT(BIC_SAM_mc6); + if (gfx_info[SAM_MHz].path) + BIC_PRESENT(BIC_SAMMHz); + if (gfx_info[SAM_ACTMHz].path) + BIC_PRESENT(BIC_SAMACTMHz); } static void dump_sysfs_cstate_config(void) -- cgit v1.2.3 From dc02dc937a3ef819c5da10e97084af6977be26bf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 22 Mar 2024 09:52:24 +0800 Subject: tools/power/turbostat: Add support for new i915 sysfs knobs On Meteorlake platform, i915 driver supports the traditional graphics sysfs knobs including /sys/class/drm/card0/power/rc6_residency_ms /sys/class/drm/card0/gt_cur_freq_mhz /sys/class/drm/card0/gt_act_freq_mhz At the same time, it also supports /sys/class/drm/card0/gt/gt0/rc6_residency_ms /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz /sys/class/drm/card0/gt/gt1/rc6_residency_ms /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz gt0 is for GFX and gt1 is for SA Media. Enhance turbostat to prefer the i915 new sysfs knobs. Export gt0 via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt1 via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. 
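For reference, the per-gt layout can be probed along the lines of the sketch below. The actual patch checks each hard-coded gt0/gt1 path explicitly; the loop and array here are purely illustrative:

/*
 * Illustrative sketch, not the patch itself: walk the per-gt i915 sysfs
 * layout (gt0 = GFX, gt1 = SA Media on these platforms) and keep the
 * rc6 residency paths that exist.
 */
#include <stdio.h>
#include <unistd.h>

#define NGT 2

int main(void)
{
	static char rc6_path[NGT][128];
	int gt;

	for (gt = 0; gt < NGT; gt++) {
		snprintf(rc6_path[gt], sizeof(rc6_path[gt]),
			 "/sys/class/drm/card0/gt/gt%d/rc6_residency_ms", gt);
		if (access(rc6_path[gt], R_OK))
			rc6_path[gt][0] = '\0';		/* knob absent here */
		else
			printf("%s counter: %s\n",
			       gt ? "SAM%mc6" : "GFX%rc6", rc6_path[gt]);
	}
	return 0;
}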
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4fa2810da1a3..feca7f4cb5cd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,29 @@ probe_cluster: static void probe_graphics(void) { + /* New i915 graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) + gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) + gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) + gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + + goto end; + } + + /* Fall back to traditional i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; @@ -5394,6 +5417,7 @@ static void probe_graphics(void) else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; +end: if (gfx_info[GFX_rc6].path) BIC_PRESENT(BIC_GFX_rc6); if (gfx_info[GFX_MHz].path) -- cgit v1.2.3 From 91a91d389543a86963beec148d98d37875154bd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 23:56:02 +0800 Subject: tools/power/turbostat: Add support for Xe sysfs knobs Xe graphics driver uses different graphics sysfs knobs including /sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt0/freq0/act_freq /sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt1/freq0/act_freq Plus that, /sys/class/drm/card0/device/tile0/gt/gtidle/name returns either gt-rc or gt-mc. rc is for GFX and mc is SA Media. Enhance turbostat to prefer the Xe sysfs knobs when they are available. Export gt-rc via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt-mc via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. 
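The gt0 role detection amounts to the following stand-alone sketch; the path and name strings match the patch, while error handling and everything else is trimmed for illustration:

/*
 * Illustrative sketch, not turbostat code: decide whether Xe's gt0 is
 * the render GT or the SA Media GT by reading its gtidle "name"
 * attribute, which is what the patch keys off.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[8] = "";
	FILE *fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");

	if (!fp)
		return 1;
	if (!fread(buf, sizeof(char), sizeof(buf) - 1, fp)) {
		fclose(fp);
		return 1;
	}
	fclose(fp);

	if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
		printf("gt0 is the render GT: report it in the GFX columns\n");
	else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
		printf("gt0 is the SA Media GT: report it in the SAM columns\n");
	else
		printf("unrecognized gt0 idle name: %s\n", buf);
	return 0;
}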
Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feca7f4cb5cd..bc103851df70 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,57 @@ probe_cluster: static void probe_graphics(void) { + /* Xe graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { + FILE *fp; + char buf[8]; + bool gt0_is_gt; + int idx; + + fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); + if (!fp) + goto next; + + if (!fread(buf, sizeof(char), 7, fp)) { + fclose(fp); + goto next; + } + fclose(fp); + + if (!strncmp(buf, "gt0-rc", strlen("gt0-rc"))) + gt0_is_gt = true; + else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc"))) + gt0_is_gt = false; + else + goto next; + + idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? GFX_MHz : SAM_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + + idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + + idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; + if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? SAM_MHz : GFX_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + + idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + + goto end; + } + +next: /* New i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; -- cgit v1.2.3 From 3ab7296a7e6aa34634dcc2926af933107a117996 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 8 Apr 2024 19:32:58 -0400 Subject: tools/power turbostat: v2024.04.10 Much of turbostat can now run with perf, rather than using the MSR driver Some of turbostat can now run as a regular non-root user. Add some new output columns for some new GFX hardware. 
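As a rough illustration of the MSR-free path, the sketch below reads one package energy counter purely through perf and sysfs. It is not turbostat code; it assumes a RAPL-capable system exposing the standard "power" perf event source with an "energy-pkg" event, and opening the event may still require perf privileges depending on perf_event_paranoid:

/*
 * Illustrative sketch, not turbostat code: read one package RAPL energy
 * counter through perf + sysfs instead of /dev/cpu/N/msr.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int read_sysfs(const char *path, const char *fmt, void *out)
{
	FILE *fp = fopen(path, "r");
	int ok;

	if (!fp)
		return -1;
	ok = (fscanf(fp, fmt, out) == 1);
	fclose(fp);
	return ok ? 0 : -1;
}

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type, config;
	double scale;
	uint64_t before, after;
	int fd;

	if (read_sysfs("/sys/bus/event_source/devices/power/type", "%u", &type) ||
	    read_sysfs("/sys/bus/event_source/devices/power/events/energy-pkg", "event=%x", &config) ||
	    read_sysfs("/sys/bus/event_source/devices/power/events/energy-pkg.scale", "%lf", &scale))
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = config;

	/* RAPL perf events are system-wide: pid == -1, one fd per package */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	if (read(fd, &before, sizeof(before)) != sizeof(before))
		return 1;
	sleep(1);
	if (read(fd, &after, sizeof(after)) != sizeof(after))
		return 1;

	printf("energy-pkg: %.2f Joules over ~1s (~PkgWatt)\n", (after - before) * scale);
	close(fd);
	return 0;
}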
[This patch updates the version, but otherwise changes no function; it touches up some checkpatch issues from previous patches] Signed-off-by: Len Brown --- MAINTAINERS | 1 + tools/power/x86/turbostat/turbostat.c | 41 ++++++++++++++++--------- tools/testing/selftests/turbostat/defcolumns.py | 1 + 3 files changed, 28 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index a7c4cf8201e0..b8582a466128 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22116,6 +22116,7 @@ Q: https://patchwork.kernel.org/project/linux-pm/list/ B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat F: tools/power/x86/turbostat/ +F: tools/testing/selftests/turbostat/ TW5864 VIDEO4LINUX DRIVER M: Bluecherry Maintainers diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc103851df70..98256468e248 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2024 Intel Corporation. * Len Brown */ @@ -1360,6 +1360,7 @@ struct sys_counters { void free_sys_counters(void) { struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { pnext = p->next; free(p); @@ -1979,6 +1980,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); @@ -3153,7 +3155,7 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned read_msr_type(void) +static unsigned int read_msr_type(void) { const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; @@ -3161,7 +3163,7 @@ static unsigned read_msr_type(void) return read_perf_counter_info_n(path, format); } -static unsigned read_aperf_config(void) +static unsigned int read_aperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; @@ -3169,7 +3171,7 @@ static unsigned read_aperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_mperf_config(void) +static unsigned int read_mperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; @@ -3177,7 +3179,7 @@ static unsigned read_mperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_perf_type(const char *subsys) +static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; const char *const format = "%u"; @@ -3188,7 +3190,7 @@ static unsigned read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_rapl_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; const char *const format = "event=%x"; @@ -3199,7 +3201,7 @@ static unsigned read_rapl_config(const char *subsys, const char *event_name) return read_perf_counter_info_n(path, format); } 
-static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; const char *const format = "%s"; @@ -3235,7 +3237,7 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int msr_type = read_msr_type(); const unsigned int aperf_config = read_aperf_config(); const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); @@ -3277,6 +3279,7 @@ static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) t->tsc = rdtsc(); const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) return -2; @@ -3371,7 +3374,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; if (debug) - fprintf(stderr, "get_rapl_counters: cpu%d domain%d\n", cpu, domain); + fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3382,8 +3385,9 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data size_t num_perf_counters = rapl_counter_info_count_perf(rci); const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) - err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size); } @@ -3454,7 +3458,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int status; if (cpu_migrate(cpu)) { - fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); + fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu); return -1; } @@ -6411,15 +6415,17 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) return -1; const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) return -1; - const unsigned rapl_type = read_perf_type(cai->perf_subsys); - const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_type = read_perf_type(cai->perf_subsys); + const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -6441,7 +6447,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); if (debug) - fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; } @@ -6462,6 +6468,7 @@ void linux_perf_init(void) } const bool aperf_required = is_aperf_access_required(); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { 
fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -6483,6 +6490,7 @@ void rapl_perf_init(void) */ for (int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; @@ -7296,6 +7304,7 @@ static void set_amperf_source(void) amperf_source = AMPERF_SOURCE_PERF; const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; @@ -7373,10 +7382,12 @@ void check_msr_access(void) void check_perf_access(void) { const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; @@ -7486,7 +7497,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py index 70d3b7780311..d9b042097da7 100755 --- a/tools/testing/selftests/turbostat/defcolumns.py +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -1,4 +1,5 @@ #!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 import subprocess from shutil import which -- cgit v1.2.3 From 076361362122a6d8a4c45f172ced5576b2d4a50d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 9 Apr 2024 13:22:12 -0700 Subject: selftests: timers: Fix valid-adjtimex signed left-shift undefined behavior The struct adjtimex freq field takes a signed value who's units are in shifted (<<16) parts-per-million. Unfortunately for negative adjustments, the straightforward use of: freq = ppm << 16 trips undefined behavior warnings with clang: valid-adjtimex.c:66:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -499<<16, ~~~~^ valid-adjtimex.c:67:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -450<<16, ~~~~^ .. Fix it by using a multiply by (1 << 16) instead of shifting negative values in the valid-adjtimex test case. Align the values for better readability. Reported-by: Lee Jones Reported-by: Muhammad Usama Anjum Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240409202222.2830476-1-jstultz@google.com Link: https://lore.kernel.org/lkml/0c6d4f0d-2064-4444-986b-1d1ed782135f@collabora.com/ --- tools/testing/selftests/timers/valid-adjtimex.c | 73 ++++++++++++------------- 1 file changed, 36 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 48b9a803235a..d13ebde20322 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -21,9 +21,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ - - - #include #include #include @@ -62,45 +59,47 @@ int clear_time_state(void) #define NUM_FREQ_OUTOFRANGE 4 #define NUM_FREQ_INVALID 2 +#define SHIFTED_PPM (1 << 16) + long valid_freq[NUM_FREQ_VALID] = { - -499<<16, - -450<<16, - -400<<16, - -350<<16, - -300<<16, - -250<<16, - -200<<16, - -150<<16, - -100<<16, - -75<<16, - -50<<16, - -25<<16, - -10<<16, - -5<<16, - -1<<16, + -499 * SHIFTED_PPM, + -450 * SHIFTED_PPM, + -400 * SHIFTED_PPM, + -350 * SHIFTED_PPM, + -300 * SHIFTED_PPM, + -250 * SHIFTED_PPM, + -200 * SHIFTED_PPM, + -150 * SHIFTED_PPM, + -100 * SHIFTED_PPM, + -75 * SHIFTED_PPM, + -50 * SHIFTED_PPM, + -25 * SHIFTED_PPM, + -10 * SHIFTED_PPM, + -5 * SHIFTED_PPM, + -1 * SHIFTED_PPM, -1000, - 1<<16, - 5<<16, - 10<<16, - 25<<16, - 50<<16, - 75<<16, - 100<<16, - 150<<16, - 200<<16, - 250<<16, - 300<<16, - 350<<16, - 400<<16, - 450<<16, - 499<<16, + 1 * SHIFTED_PPM, + 5 * SHIFTED_PPM, + 10 * SHIFTED_PPM, + 25 * SHIFTED_PPM, + 50 * SHIFTED_PPM, + 75 * SHIFTED_PPM, + 100 * SHIFTED_PPM, + 150 * SHIFTED_PPM, + 200 * SHIFTED_PPM, + 250 * SHIFTED_PPM, + 300 * SHIFTED_PPM, + 350 * SHIFTED_PPM, + 400 * SHIFTED_PPM, + 450 * SHIFTED_PPM, + 499 * SHIFTED_PPM, }; long outofrange_freq[NUM_FREQ_OUTOFRANGE] = { - -1000<<16, - -550<<16, - 550<<16, - 1000<<16, + -1000 * SHIFTED_PPM, + -550 * SHIFTED_PPM, + 550 * SHIFTED_PPM, + 1000 * SHIFTED_PPM, }; #define LONG_MAX (~0UL>>1) -- cgit v1.2.3 From f971f6dd3742d22dd13710306fb4365ea7bcb536 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Fri, 22 Mar 2024 06:46:02 -0700 Subject: hv/hv_kvp_daemon: Handle IPv4 and Ipv6 combination for keyfile format If the network configuration strings are passed as a combination of IPv4 and IPv6 addresses, the current KVP daemon does not handle processing for the keyfile configuration format. With these changes, the keyfile config generation logic scans through the list twice to generate IPv4 and IPv6 sections for the configuration files to handle this support. 
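To make the two-section split concrete, here is a small standalone sketch (not the daemon itself) of the classification the patch builds on: inet_pton() decides whether each address string belongs under [ipv4] or [ipv6], and the list is walked once per section, mirroring ip_version_check() and the two-pass loop added below. The example addresses are invented, and the output goes to stdout rather than to a NetworkManager keyfile.

/*
 * Standalone sketch of the per-family classification (illustrative only).
 */
#include <arpa/inet.h>
#include <stdio.h>

enum { IPV4 = 1, IPV6 };

static int ip_version(const char *s)
{
	struct in6_addr a;

	if (inet_pton(AF_INET, s, &a))
		return IPV4;
	if (inet_pton(AF_INET6, s, &a))
		return IPV6;
	return -1;	/* neither family; never matches a pass below */
}

int main(void)
{
	/* made-up mixed address list, not data from the KVP pool */
	const char *addrs[] = { "192.168.1.10", "2001:db8::5", "10.0.0.1" };
	int pass, i;

	/* two passes, one per keyfile section, like the patch */
	for (pass = IPV4; pass <= IPV6; pass++) {
		printf(pass == IPV4 ? "[ipv4]\n" : "[ipv6]\n");
		for (i = 0; i < 3; i++)
			if (ip_version(addrs[i]) == pass)
				printf("  %s\n", addrs[i]);
	}
	return 0;
}

Walking the list twice costs one extra parse per address, but it keeps each section contiguous, which is what the keyfile format expects.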
Testcases ran:Rhel 9, Hyper-V VMs (IPv4 only, IPv6 only, IPv4 and IPv6 combination) Co-developed-by: Ani Sinha Signed-off-by: Ani Sinha Signed-off-by: Shradha Gupta Reviewed-by: Easwar Hariharan Tested-by: Ani Sinha Reviewed-by: Ani Sinha Link: https://lore.kernel.org/r/1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com> --- tools/hv/hv_kvp_daemon.c | 213 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 172 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 318e2dad27e0..ae57bf69ad4a 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -76,6 +76,12 @@ enum { DNS }; +enum { + IPV4 = 1, + IPV6, + IP_TYPE_MAX +}; + static int in_hand_shake; static char *os_name = ""; @@ -102,6 +108,11 @@ static struct utsname uts_buf; #define MAX_FILE_NAME 100 #define ENTRIES_PER_BLOCK 50 +/* + * Change this entry if the number of addresses increases in future + */ +#define MAX_IP_ENTRIES 64 +#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES) struct kvp_record { char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; @@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type) return 0; } +int ip_version_check(const char *input_addr) +{ + struct in6_addr addr; + + if (inet_pton(AF_INET, input_addr, &addr)) + return IPV4; + else if (inet_pton(AF_INET6, input_addr, &addr)) + return IPV6; + + return -EINVAL; +} + /* * Only IPv4 subnet strings needs to be converted to plen * For IPv6 the subnet is already privided in plen format @@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str) return plen; } +static int process_dns_gateway_nm(FILE *f, char *ip_string, int type, + int ip_sec) +{ + char addr[INET6_ADDRSTRLEN], *output_str; + int ip_offset = 0, error = 0, ip_ver; + char *param_name; + + if (type == DNS) + param_name = "dns"; + else if (type == GATEWAY) + param_name = "gateway"; + else + return -EINVAL; + + output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char)); + if (!output_str) + return -ENOMEM; + + while (1) { + memset(addr, 0, sizeof(addr)); + + if (!parse_ip_val_buffer(ip_string, &ip_offset, addr, + (MAX_IP_ADDR_SIZE * 2))) + break; + + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if ((ip_ver == IPV4 && ip_sec == IPV4) || + (ip_ver == IPV6 && ip_sec == IPV6)) { + /* + * do a bound check to avoid out-of bound writes + */ + if ((OUTSTR_BUF_SIZE - strlen(output_str)) > + (strlen(addr) + 1)) { + strncat(output_str, addr, + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + strncat(output_str, ",", + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + } + } else { + continue; + } + } + + if (strlen(output_str)) { + /* + * This is to get rid of that extra comma character + * in the end of the string + */ + output_str[strlen(output_str) - 1] = '\0'; + error = fprintf(f, "%s=%s\n", param_name, output_str); + } + + free(output_str); + return error; +} + static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, - int is_ipv6) + int ip_sec) { char addr[INET6_ADDRSTRLEN]; char subnet_addr[INET6_ADDRSTRLEN]; - int error, i = 0; + int error = 0, i = 0; int ip_offset = 0, subnet_offset = 0; - int plen; + int plen, ip_ver; memset(addr, 0, sizeof(addr)); memset(subnet_addr, 0, sizeof(subnet_addr)); @@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, subnet_addr, (MAX_IP_ADDR_SIZE 
* 2))) { - if (!is_ipv6) + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if (ip_ver == IPV4 && ip_sec == IPV4) plen = kvp_subnet_to_plen((char *)subnet_addr); - else + else if (ip_ver == IPV6 && ip_sec == IPV6) plen = atoi(subnet_addr); + else + continue; if (plen < 0) return plen; @@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, memset(subnet_addr, 0, sizeof(subnet_addr)); } - return 0; + return error; } static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) { - int error = 0; + int error = 0, ip_ver; char if_filename[PATH_MAX]; char nm_filename[PATH_MAX]; FILE *ifcfg_file, *nmfile; char cmd[PATH_MAX]; - int is_ipv6 = 0; char *mac_addr; int str_len; @@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto setval_error; - if (new_val->addr_family & ADDR_FAMILY_IPV6) { - error = fprintf(nmfile, "\n[ipv6]\n"); - if (error < 0) - goto setval_error; - is_ipv6 = 1; - } else { - error = fprintf(nmfile, "\n[ipv4]\n"); - if (error < 0) - goto setval_error; - } - /* * Now we populate the keyfile format + * + * The keyfile format expects the IPv6 and IPv4 configuration in + * different sections. Therefore we iterate through the list twice, + * once to populate the IPv4 section and the next time for IPv6 */ + ip_ver = IPV4; + do { + if (ip_ver == IPV4) { + error = fprintf(nmfile, "\n[ipv4]\n"); + if (error < 0) + goto setval_error; + } else { + error = fprintf(nmfile, "\n[ipv6]\n"); + if (error < 0) + goto setval_error; + } - if (new_val->dhcp_enabled) { - error = kvp_write_file(nmfile, "method", "", "auto"); - if (error < 0) - goto setval_error; - } else { - error = kvp_write_file(nmfile, "method", "", "manual"); + /* + * Write the configuration for ipaddress, netmask, gateway and + * name services + */ + error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, + (char *)new_val->sub_net, + ip_ver); if (error < 0) goto setval_error; - } - /* - * Write the configuration for ipaddress, netmask, gateway and - * name services - */ - error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, - (char *)new_val->sub_net, is_ipv6); - if (error < 0) - goto setval_error; + /* + * As dhcp_enabled is only valid for ipv4, we do not set dhcp + * methods for ipv6 based on dhcp_enabled flag. + * + * For ipv4, set method to manual only when dhcp_enabled is + * false and specific ipv4 addresses are configured. If neither + * dhcp_enabled is true and no ipv4 addresses are configured, + * set method to 'disabled'. + * + * For ipv6, set method to manual when we configure ipv6 + * addresses. Otherwise set method to 'auto' so that SLAAC from + * RA may be used. 
+ */ + if (ip_ver == IPV4) { + if (new_val->dhcp_enabled) { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } else if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "disabled"); + if (error < 0) + goto setval_error; + } + } else if (ip_ver == IPV6) { + if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } + } - /* we do not want ipv4 addresses in ipv6 section and vice versa */ - if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->gate_way, + GATEWAY, ip_ver); if (error < 0) goto setval_error; - } - if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->dns_addr, DNS, + ip_ver); if (error < 0) goto setval_error; - } + + ip_ver++; + } while (ip_ver < IP_TYPE_MAX); + fclose(nmfile); fclose(ifcfg_file); -- cgit v1.2.3 From 2b8dbf69ec60faf6c7db49e57d7f316409ccec92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:17:57 -0700 Subject: perf annotate: Make sure to call symbol__annotate2() in TUI The symbol__annotate2() initializes some data structures needed by TUI. It has a logic to prevent calling it multiple times by checking if it has the annotated source. But data type profiling uses a different code (symbol__annotate) to allocate the annotated lines in advance. So TUI missed to call symbol__annotate2() when it shows the annotation browser. Make symbol__annotate() reentrant and handle that situation properly. This fixes a crash in the annotation browser started by perf report in TUI like below. 
$ perf report -s type,sym --tui # and press 'a' key and then move down Fixes: 81e57deec325 ("perf report: Support data type profiling") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240405211800.1412920-2-namhyung@kernel.org --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ec5e21932876..4790c735599b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -970,7 +970,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if (dso->annotate_warned) return -1; - if (not_annotated) { + if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac002d907d81..50ca92255ff6 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2461,6 +2461,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; + if (!list_empty(¬es->src->source)) + return 0; + args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) -- cgit v1.2.3 From f3408580bac8ce5cd76e7391e529c0a22e7c7eb2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Apr 2024 15:55:42 -0700 Subject: perf lock contention: Add a missing NULL check I got a report for a failure in BPF verifier on a recent kernel with perf lock contention command. It checks task->sighand->siglock without checking if sighand is NULL or not. Let's add one. ; if (&curr->sighand->siglock == (void *)lock) 265: (79) r1 = *(u64 *)(r0 +2624) ; frame1: R0_w=trusted_ptr_task_struct(off=0,imm=0) ; R1_w=rcu_ptr_or_null_sighand_struct(off=0,imm=0) 266: (b7) r2 = 0 ; frame1: R2_w=0 267: (0f) r1 += r2 R1 pointer arithmetic on rcu_ptr_or_null_ prohibited, null-check it first processed 164 insns (limit 1000000) max_states_per_insn 1 total_states 15 peak_states 15 mark_read 5 -- END PROG LOAD LOG -- libbpf: prog 'contention_end': failed to load: -13 libbpf: failed to load object 'lock_contention_bpf' libbpf: failed to load BPF skeleton 'lock_contention_bpf': -13 Failed to load lock-contention BPF skeleton lock contention BPF setup failed lock contention did not detect any lock contention Fixes: 1811e82767dcc ("perf lock contention: Track and show siglock with address") Reviewed-by: Ian Rogers Acked-by: Arnaldo Carvalho de Melo Cc: Song Liu Cc: bpf@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240409225542.1870999-1-namhyung@kernel.org --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index fb54bd38e7d0..d931a898c434 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -284,6 +284,7 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) struct task_struct *curr; struct mm_struct___old *mm_old; struct mm_struct___new *mm_new; + struct sighand_struct *sighand; switch (flags) { case LCB_F_READ: /* rwsem */ @@ -305,7 +306,9 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) break; case LCB_F_SPIN: /* spinlock */ curr = bpf_get_current_task_btf(); - if (&curr->sighand->siglock == (void *)lock) + sighand = 
curr->sighand; + + if (sighand && &sighand->siglock == (void *)lock) return LCD_F_SIGHAND_LOCK; break; default: -- cgit v1.2.3 From 3ef842a77e7cdf757fe3f1d2999aa2cc88eb53ba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:12 -0700 Subject: tools/include: Sync uapi/drm/i915_drm.h with the kernel sources To pick up changes from: b112364867499 ("drm/i915: Add GuC submission interface version query") 5cf0fbf763741 ("drm/i915: Add some boring kerneldoc") This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-2-namhyung@kernel.org --- tools/include/uapi/drm/i915_drm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fd4f9574d177..2ee338860b7e 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -3013,6 +3013,7 @@ struct drm_i915_query_item { * - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions) * - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`) * - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info) + * - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct drm_i915_query_guc_submission_version) */ __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 @@ -3021,6 +3022,7 @@ struct drm_i915_query_item { #define DRM_I915_QUERY_MEMORY_REGIONS 4 #define DRM_I915_QUERY_HWCONFIG_BLOB 5 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6 +#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION 7 /* Must be kept compact -- no holes and well documented */ /** @@ -3566,6 +3568,20 @@ struct drm_i915_query_memory_regions { struct drm_i915_memory_region_info regions[]; }; +/** + * struct drm_i915_query_guc_submission_version - query GuC submission interface version + */ +struct drm_i915_query_guc_submission_version { + /** @branch: Firmware branch version. */ + __u32 branch; + /** @major: Firmware major version. */ + __u32 major; + /** @minor: Firmware minor version. */ + __u32 minor; + /** @patch: Firmware patch version. 
*/ + __u32 patch; +}; + /** * DOC: GuC HWCONFIG blob uAPI * -- cgit v1.2.3 From 4cfa8a873d3e3a87894f8de056ee69a857b5adcd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:13 -0700 Subject: tools/include: Sync uapi/linux/fs.h with the kernel sources To pick up the changes from: 41bcbe59c3b3f ("fs: FS_IOC_GETUUID") ae8c511757304 ("fs: add FS_IOC_GETFSSYSFSPATH") 73fa7547c70b3 ("vfs: add RWF_NOAPPEND flag for pwritev2") This should be used to beautify fs syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-3-namhyung@kernel.org --- tools/include/uapi/linux/fs.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 48ad69f7722e..45e4e64fd664 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -64,6 +64,24 @@ struct fstrim_range { __u64 minlen; }; +/* + * We include a length field because some filesystems (vfat) have an identifier + * that we do want to expose as a UUID, but doesn't have the standard length. + * + * We use a fixed size buffer beacuse this interface will, by fiat, never + * support "UUIDs" longer than 16 bytes; we don't want to force all downstream + * users to have to deal with that. + */ +struct fsuuid2 { + __u8 len; + __u8 uuid[16]; +}; + +struct fs_sysfs_path { + __u8 len; + __u8 name[128]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -215,6 +233,13 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +/* Returns the external filesystem UUID, the same one blkid returns */ +#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) +/* + * Returns the path component under /sys/fs/ that refers to this filesystem; + * also /sys/kernel/debug/ for filesystems with debugfs exports + */ +#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -301,9 +326,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) -- cgit v1.2.3 From bee3b820c66a6aae0e16d0ac47f9744446f33bff Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:14 -0700 Subject: tools/include: Sync uapi/linux/kvm.h and asm/kvm.h with the kernel sources To pick up the changes from: 6bda055d6258 ("KVM: define __KVM_HAVE_GUEST_DEBUG unconditionally") 5d9cb71642db ("KVM: arm64: move ARM-specific defines to uapi/asm/kvm.h") 71cd774ad2f9 ("KVM: s390: move s390-specific structs to uapi/asm/kvm.h") d750951c9ed7 ("KVM: powerpc: move powerpc-specific structs to uapi/asm/kvm.h") bcac0477277e ("KVM: x86: move x86-specific structs to 
uapi/asm/kvm.h") c0a411904e15 ("KVM: remove more traces of device assignment UAPI") f3c80061c0d3 ("KVM: SEV: fix compat ABI for KVM_MEMORY_ENCRYPT_OP") That should be used to beautify the KVM arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h diff -u tools/arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/include/uapi/asm/kvm.h diff -u tools/arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm.h diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-4-namhyung@kernel.org --- tools/arch/arm64/include/uapi/asm/kvm.h | 15 +- tools/arch/powerpc/include/uapi/asm/kvm.h | 45 +- tools/arch/s390/include/uapi/asm/kvm.h | 315 +++++++++++++- tools/arch/x86/include/uapi/asm/kvm.h | 308 ++++++++++++- tools/include/uapi/linux/kvm.h | 689 +----------------------------- 5 files changed, 672 insertions(+), 700 deletions(-) (limited to 'tools') diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 89d2fc872d9f..964df31da975 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -37,9 +37,7 @@ #include #include -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -76,11 +74,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ - KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK __GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ - KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK __GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 @@ -162,6 +160,11 @@ struct kvm_sync_regs { __u64 device_irq_level; }; +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 9f18fa090f1f..1691297a766a 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -28,7 +28,6 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_GUEST_DEBUG /* Not always available, but if it is, this is the correct offset. 
*/ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -733,4 +732,48 @@ struct kvm_ppc_xive_eq { #define KVM_XIVE_TIMA_PAGE_OFFSET 0 #define KVM_XIVE_ESB_PAGE_OFFSET 4 +/* for KVM_PPC_GET_PVINFO */ + +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index abe926d43cbe..05eaf6db3ad4 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -12,7 +12,320 @@ #include #define __KVM_S390 -#define __KVM_HAVE_GUEST_DEBUG + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. 
+ */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + __u8 pad1[6]; /* ignored */ + __u64 old_addr; /* ignored if cmpxchg flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 +#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 + +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) + +/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ +#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) +#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; 
+}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; + +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. 
Must be 0 for now */ + __u32 reserved[3]; +}; + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) /* Device control API: s390-specific devices */ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index a448d0964fc0..ef11aa4cab42 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include +#include #include #include #include @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 +50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -526,9 +526,301 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. 
+ */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 
len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. * Bits Description @@ -549,10 +841,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,7 +852,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. */ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482b..2190adbe3002 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -16,6 +16,11 @@ #define KVM_API_VERSION 12 +/* + * Backwards-compatible definitions. + */ +#define __KVM_HAVE_GUEST_DEBUG + /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -85,43 +90,6 @@ struct kvm_pit_config { #define KVM_PIT_SPEAKER_DUMMY 1 -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -#define KVM_S390_CMMA_PEEK (1 << 0) - -/** - * kvm_s390_cmma_log - Used for CMMA migration. - * - * Used both for input and output. - * - * @start_gfn: Guest page number to start from. - * @count: Size of the result buffer. - * @flags: Control operation mode via KVM_S390_CMMA_* flags - * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty - * pages are still remaining. 
- * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set - * in the PGSTE. - * @values: Pointer to the values buffer. - * - * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. - */ -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 @@ -315,11 +283,6 @@ struct kvm_run { __u32 ipb; } s390_sieic; /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 __u64 s390_reset_flags; /* KVM_EXIT_S390_UCONTROL */ struct { @@ -536,43 +499,6 @@ struct kvm_translation { __u8 pad[5]; }; -/* for KVM_S390_MEM_OP */ -struct kvm_s390_mem_op { - /* in */ - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - union { - struct { - __u8 ar; /* the access register number */ - __u8 key; /* access key, ignored if flag unset */ - __u8 pad1[6]; /* ignored */ - __u64 old_addr; /* ignored if cmpxchg flag unset */ - }; - __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* ignored */ - }; -}; -/* types for kvm_s390_mem_op->op */ -#define KVM_S390_MEMOP_LOGICAL_READ 0 -#define KVM_S390_MEMOP_LOGICAL_WRITE 1 -#define KVM_S390_MEMOP_SIDA_READ 2 -#define KVM_S390_MEMOP_SIDA_WRITE 3 -#define KVM_S390_MEMOP_ABSOLUTE_READ 4 -#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 -#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 - -/* flags for kvm_s390_mem_op->flags */ -#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) -#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) - -/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ -#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) -#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) - /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -637,124 +563,6 @@ struct kvm_mp_state { __u32 mp_state; }; -struct kvm_s390_psw { - __u64 mask; - __u64 addr; -}; - -/* valid values for type in kvm_s390_interrupt */ -#define KVM_S390_SIGP_STOP 0xfffe0000u -#define KVM_S390_PROGRAM_INT 0xfffe0001u -#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u -#define KVM_S390_RESTART 0xfffe0003u -#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u -#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u -#define KVM_S390_MCHK 0xfffe1000u -#define KVM_S390_INT_CLOCK_COMP 0xffff1004u -#define KVM_S390_INT_CPU_TIMER 0xffff1005u -#define KVM_S390_INT_VIRTIO 0xffff2603u -#define KVM_S390_INT_SERVICE 0xffff2401u -#define KVM_S390_INT_EMERGENCY 0xffff1201u -#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u -/* Anything below 0xfffe0000u is taken by INT_IO */ -#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ - (((schid)) | \ - ((ssid) << 16) | \ - ((cssid) << 18) | \ - ((ai) << 26)) -#define KVM_S390_INT_IO_MIN 0x00000000u -#define KVM_S390_INT_IO_MAX 0xfffdffffu -#define KVM_S390_INT_IO_AI_MASK 0x04000000u - - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -struct kvm_s390_io_info { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u32 pad; - __u64 ext_params2; -}; - -struct kvm_s390_pgm_info { - __u64 trans_exc_code; - __u64 mon_code; - __u64 per_address; - __u32 data_exc_code; - __u16 code; - 
__u16 mon_class_nr; - __u8 per_code; - __u8 per_atmid; - __u8 exc_access_id; - __u8 per_access_id; - __u8 op_access_id; -#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 -#define KVM_S390_PGM_FLAGS_ILC_0 0x02 -#define KVM_S390_PGM_FLAGS_ILC_1 0x04 -#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 -#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 - __u8 flags; - __u8 pad[2]; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 -struct kvm_s390_stop_info { - __u32 flags; -}; - -struct kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 pad; - __u8 fixed_logout[16]; -}; - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -810,50 +618,6 @@ struct kvm_enable_cap { __u8 pad[64]; }; -/* for KVM_PPC_GET_PVINFO */ - -#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -/* for KVM_PPC_GET_SMMU_INFO */ -#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 - -struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ -}; - -struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 -#define KVM_PPC_1T_SEGMENTS 0x00000002 -#define KVM_PPC_NO_HASH 0x00000004 - -struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u16 data_keys; /* # storage keys supported for data */ - __u16 instr_keys; /* # storage keys supported for instructions */ - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -923,9 +687,7 @@ struct kvm_ppc_resize_hpt { /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 #define KVM_CAP_USER_NMI 22 -#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 -#endif #ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif @@ -1156,8 +918,6 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 -#ifdef KVM_CAP_IRQ_ROUTING - struct kvm_irq_routing_irqchip { __u32 irqchip; __u32 pin; @@ -1222,42 +982,6 @@ struct kvm_irq_routing { struct kvm_irq_routing_entry entries[]; }; -#endif - -#ifdef KVM_CAP_MCE -/* x86 MCE */ -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; -#endif - -#ifdef KVM_CAP_XEN_HVM -#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) -#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) 
-#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) -#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) -#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) -#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) -#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) -#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; -#endif - #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) /* * Available with KVM_CAP_IRQFD_RESAMPLE @@ -1442,11 +1166,6 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ -struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; -}; #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) @@ -1641,89 +1360,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -struct kvm_s390_pv_sec_parm { - __u64 origin; - __u64 length; -}; - -struct kvm_s390_pv_unp { - __u64 addr; - __u64 size; - __u64 tweak; -}; - -enum pv_cmd_dmp_id { - KVM_PV_DUMP_INIT, - KVM_PV_DUMP_CONFIG_STOR_STATE, - KVM_PV_DUMP_COMPLETE, - KVM_PV_DUMP_CPU, -}; - -struct kvm_s390_pv_dmp { - __u64 subcmd; - __u64 buff_addr; - __u64 buff_len; - __u64 gaddr; /* For dump storage state */ - __u64 reserved[4]; -}; - -enum pv_cmd_info_id { - KVM_PV_INFO_VM, - KVM_PV_INFO_DUMP, -}; - -struct kvm_s390_pv_info_dump { - __u64 dump_cpu_buffer_len; - __u64 dump_config_mem_buffer_per_1m; - __u64 dump_config_finalize_len; -}; - -struct kvm_s390_pv_info_vm { - __u64 inst_calls_list[4]; - __u64 max_cpus; - __u64 max_guests; - __u64 max_guest_addr; - __u64 feature_indication; -}; - -struct kvm_s390_pv_info_header { - __u32 id; - __u32 len_max; - __u32 len_written; - __u32 reserved; -}; - -struct kvm_s390_pv_info { - struct kvm_s390_pv_info_header header; - union { - struct kvm_s390_pv_info_dump dump; - struct kvm_s390_pv_info_vm vm; - }; -}; - -enum pv_cmd_id { - KVM_PV_ENABLE, - KVM_PV_DISABLE, - KVM_PV_SET_SEC_PARMS, - KVM_PV_UNPACK, - KVM_PV_VERIFY, - KVM_PV_PREP_RESET, - KVM_PV_UNSHARE_ALL, - KVM_PV_INFO, - KVM_PV_DUMP, - KVM_PV_ASYNC_CLEANUP_PREPARE, - KVM_PV_ASYNC_CLEANUP_PERFORM, -}; - -struct kvm_pv_cmd { - __u32 cmd; /* Command to be executed */ - __u16 rc; /* Ultravisor return code */ - __u16 rrc; /* Ultravisor return reason code */ - __u64 data; /* Data or address */ - __u32 flags; /* flags for future extensions. 
Must be 0 for now */ - __u32 reserved[3]; -}; - /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) @@ -1737,58 +1373,6 @@ struct kvm_pv_cmd { #define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) #define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) -struct kvm_xen_hvm_attr { - __u16 type; - __u16 pad[3]; - union { - __u8 long_mode; - __u8 vector; - __u8 runstate_update_flag; - struct { - __u64 gfn; -#define KVM_XEN_INVALID_GFN ((__u64)-1) - } shared_info; - struct { - __u32 send_port; - __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ - __u32 flags; -#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) -#define KVM_XEN_EVTCHN_UPDATE (1 << 1) -#define KVM_XEN_EVTCHN_RESET (1 << 2) - /* - * Events sent by the guest are either looped back to - * the guest itself (potentially on a different port#) - * or signalled via an eventfd. - */ - union { - struct { - __u32 port; - __u32 vcpu; - __u32 priority; - } port; - struct { - __u32 port; /* Zero for eventfd */ - __s32 fd; - } eventfd; - __u32 padding[4]; - } deliver; - } evtchn; - __u32 xen_version; - __u64 pad[8]; - } u; -}; - - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 -#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 -#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 -#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ -#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 - /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) @@ -1799,242 +1383,6 @@ struct kvm_xen_hvm_attr { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) -struct kvm_xen_vcpu_attr { - __u16 type; - __u16 pad[3]; - union { - __u64 gpa; -#define KVM_XEN_INVALID_GPA ((__u64)-1) - __u64 pad[8]; - struct { - __u64 state; - __u64 state_entry_time; - __u64 time_running; - __u64 time_runnable; - __u64 time_blocked; - __u64 time_offline; - } runstate; - __u32 vcpu_id; - struct { - __u32 port; - __u32 priority; - __u64 expires_ns; - } timer; - __u8 vector; - } u; -}; - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 -#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 -#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 - -/* Secure Encrypted Virtualization command */ -enum sev_cmd_id { - /* Guest initialization commands */ - KVM_SEV_INIT = 0, - KVM_SEV_ES_INIT, - /* Guest launch commands */ - KVM_SEV_LAUNCH_START, - KVM_SEV_LAUNCH_UPDATE_DATA, - KVM_SEV_LAUNCH_UPDATE_VMSA, - KVM_SEV_LAUNCH_SECRET, - KVM_SEV_LAUNCH_MEASURE, - KVM_SEV_LAUNCH_FINISH, - /* Guest migration commands (outgoing) */ - KVM_SEV_SEND_START, - KVM_SEV_SEND_UPDATE_DATA, - KVM_SEV_SEND_UPDATE_VMSA, - KVM_SEV_SEND_FINISH, - /* Guest migration commands (incoming) */ - 
KVM_SEV_RECEIVE_START, - KVM_SEV_RECEIVE_UPDATE_DATA, - KVM_SEV_RECEIVE_UPDATE_VMSA, - KVM_SEV_RECEIVE_FINISH, - /* Guest status and debug commands */ - KVM_SEV_GUEST_STATUS, - KVM_SEV_DBG_DECRYPT, - KVM_SEV_DBG_ENCRYPT, - /* Guest certificates commands */ - KVM_SEV_CERT_EXPORT, - /* Attestation report */ - KVM_SEV_GET_ATTESTATION_REPORT, - /* Guest Migration Extension */ - KVM_SEV_SEND_CANCEL, - - KVM_SEV_NR_MAX, -}; - -struct kvm_sev_cmd { - __u32 id; - __u64 data; - __u32 error; - __u32 sev_fd; -}; - -struct kvm_sev_launch_start { - __u32 handle; - __u32 policy; - __u64 dh_uaddr; - __u32 dh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_launch_update_data { - __u64 uaddr; - __u32 len; -}; - - -struct kvm_sev_launch_secret { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_launch_measure { - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_guest_status { - __u32 handle; - __u32 policy; - __u32 state; -}; - -struct kvm_sev_dbg { - __u64 src_uaddr; - __u64 dst_uaddr; - __u32 len; -}; - -struct kvm_sev_attestation_report { - __u8 mnonce[16]; - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_send_start { - __u32 policy; - __u64 pdh_cert_uaddr; - __u32 pdh_cert_len; - __u64 plat_certs_uaddr; - __u32 plat_certs_len; - __u64 amd_certs_uaddr; - __u32 amd_certs_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_send_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_receive_start { - __u32 handle; - __u32 policy; - __u64 pdh_uaddr; - __u32 pdh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_receive_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -#define KVM_DEV_IRQ_HOST_MASK 0x00ff -#define KVM_DEV_IRQ_GUEST_MASK 0xff00 - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -/* Available with KVM_CAP_ARM_USER_IRQ */ - -/* Bits for run->s.regs.device_irq_level */ -#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) -#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) -#define KVM_ARM_DEV_PMU (1 << 2) - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) 
#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) @@ -2180,33 +1528,6 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_ZPCI_OP */ #define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) -struct kvm_s390_zpci_op { - /* in */ - __u32 fh; /* target device */ - __u8 op; /* operation to perform */ - __u8 pad[3]; - union { - /* for KVM_S390_ZPCIOP_REG_AEN */ - struct { - __u64 ibv; /* Guest addr of interrupt bit vector */ - __u64 sb; /* Guest addr of summary bit */ - __u32 flags; - __u32 noi; /* Number of interrupts */ - __u8 isc; /* Guest interrupt subclass */ - __u8 sbo; /* Offset of guest summary bit vector */ - __u16 pad; - } reg_aen; - __u64 reserved[8]; - } u; -}; - -/* types for kvm_s390_zpci_op->op */ -#define KVM_S390_ZPCIOP_REG_AEN 0 -#define KVM_S390_ZPCIOP_DEREG_AEN 1 - -/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ -#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) - /* Available with KVM_CAP_MEMORY_ATTRIBUTES */ #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) -- cgit v1.2.3 From b7ce17f257da17a4163da82e0fb7726c2de85da7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:15 -0700 Subject: tools/include: Sync uapi/sound/asound.h with the kernel sources To pick up the changes from: 85df6b5a6658 ("ALSA: pcm: clarify and fix default msbits value for all formats") This should be used to beautify sound syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux-sound@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-5-namhyung@kernel.org --- tools/include/uapi/sound/asound.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index d5b9cfbd9cea..628d46a0da92 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -416,7 +416,7 @@ struct snd_pcm_hw_params { unsigned int rmask; /* W: requested masks */ unsigned int cmask; /* R: changed masks */ unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ -- cgit v1.2.3 From 58e1b92df491c35abad7ddd2e393b89244e16bd5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:16 -0700 Subject: tools/include: Sync x86 CPU feature headers with the kernel sources To pick up the changes from: 598c2fafc06f ("perf/x86/amd/lbr: Use freeze based on availability") 7f274e609f3d ("x86/cpufeatures: Add new word for scattered features") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h diff -u tools/arch/x86/include/asm/required-features.h 
arch/x86/include/asm/required-features.h diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-6-namhyung@kernel.org --- tools/arch/x86/include/asm/cpufeatures.h | 17 ++++++++++++----- tools/arch/x86/include/asm/disabled-features.h | 11 +++++++++-- tools/arch/x86/include/asm/required-features.h | 3 ++- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 25160d26764b..a38f8f9ba657 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -97,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ @@ -461,6 +459,14 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc. + * + * Reuse free bits when adding new feature flags! 
+ */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ + /* * BUG word(s) */ @@ -508,4 +514,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 1f23960d2b06..c492bdc97b05 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -123,6 +123,12 @@ # define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) #endif +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -147,8 +153,9 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- cgit v1.2.3 From 978f2a60dd5ca6c25dfd5e24e7191b16af0ec429 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:17 -0700 Subject: tools/include: Sync x86 asm/irq_vectors.h with the kernel sources To pick up the changes from: 0cbca1bf44a0 ("x86: irq: unconditionally define KVM interrupt vectors") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/irq_vectors.h arch/x86/include/asm/irq_vectors.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-7-namhyung@kernel.org --- tools/arch/x86/include/asm/irq_vectors.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index 3f73ac3ed3a0..d18bfb238f66 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -84,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#if IS_ENABLED(CONFIG_KVM) #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -- cgit v1.2.3 From c781a72f9ddd09baddde9df1f59955c0d6ea4944 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:18 -0700 Subject: tools/include: Sync x86 asm/msr-index.h with the kernel sources To pick up the changes from: 8076fcde016c ("x86/rfds: Mitigate Register File Data Sampling (RFDS)") d7b69b590bc9 ("x86/sev: Dump SEV_STATUS") cd6df3f378f6 ("x86/cpu: Add MSR numbers for FRED configuration") 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-8-namhyung@kernel.org --- tools/arch/x86/include/asm/msr-index.h | 74 +++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 1f9dc9bd13eb..05956bd8bacf 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -176,6 +176,14 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. 
+ */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -605,34 +613,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -719,8 +740,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define 
MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- cgit v1.2.3 From 99e4e1174acd7f5a942d37e1ac6c115f870d5975 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:19 -0700 Subject: tools/include: Sync asm-generic/bitops/fls.h with the kernel sources To pick up the changes from: cb4ede926134 ("riscv: Avoid code duplication with generic bitops implementation") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/asm-generic/bitops/__fls.h include/asm-generic/bitops/__fls.h diff -u tools/include/asm-generic/bitops/fls.h include/asm-generic/bitops/fls.h Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Palmer Dabbelt Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-9-namhyung@kernel.org --- tools/include/asm-generic/bitops/__fls.h | 8 ++++++-- tools/include/asm-generic/bitops/fls.h | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 03f721a8a2b1..54ccccf96e21 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -5,12 +5,12 @@ #include /** - * __fls - find last (most-significant) set bit in a long word + * generic___fls - find last (most-significant) set bit in a long word * @word: the word to search * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long generic___fls(unsigned long word) { int num = BITS_PER_LONG - 1; @@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FLS +#define __fls(word) generic___fls(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */ diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index b168bb10e1be..26f3ce1dd6e4 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FLS_H_ /** - * fls - find last (most-significant) bit set + * generic_fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
*/ -static __always_inline int fls(unsigned int x) +static __always_inline int generic_fls(unsigned int x) { int r = 32; @@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x) return r; } +#ifndef __HAVE_ARCH_FLS +#define fls(x) generic_fls(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ -- cgit v1.2.3 From 1cebd7f74976455ccd89c1dfbcf00bca52d0a512 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:20 -0700 Subject: tools/include: Sync arm64 asm/cputype.h with the kernel sources To pick up the changes from: fb091ff39479 ("arm64: Subscribe Microsoft Azure Cobalt 100 to ARM Neoverse N2 errata") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-10-namhyung@kernel.org --- tools/arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7c7493cb571f..52f076afeb96 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -61,6 +61,7 @@ #define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_IMP_APPLE 0x61 #define ARM_CPU_IMP_AMPERE 0xC0 +#define ARM_CPU_IMP_MICROSOFT 0x6D #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -135,6 +136,8 @@ #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -193,6 +196,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX -- cgit v1.2.3 From 40e0ee6338f0c042c0dabe1f17eb76eac37b5425 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:41 -0700 Subject: KVM: selftests: Add coverage of EPT-disabled to vmx_dirty_log_test Extend vmx_dirty_log_test to include accesses made by L2 when EPT is disabled. This commit adds explicit coverage of a bug caught by syzkaller, where the TDP MMU would clear D-bits instead of write-protecting SPTEs being used to map an L2, which only happens when L1 does not enable EPT, causing writes made by L2 to not be reflected in the dirty log when PML is enabled: $ ./vmx_dirty_log_test Nested EPT: disabled ==== Test Assertion Failure ==== x86_64/vmx_dirty_log_test.c:151: test_bit(0, bmap) pid=72052 tid=72052 errno=4 - Interrupted system call (stack trace empty) Page 0 incorrectly reported clean Opportunistically replace the volatile casts with {READ,WRITE}_ONCE(). 
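As background, the failing assertion quoted above lives in the host loop that pulls the dirty log after each guest write. A minimal sketch of that check follows; the helper and slot names (kvm_vm_get_dirty_log(), TEST_MEM_SLOT_INDEX) are assumptions based on the usual KVM selftest pattern, not necessarily this test's exact code:

    /* Illustrative sketch only: fetch the dirty log for the test
     * memslot and verify the page L2 just wrote is reported dirty. */
    kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
    TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
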
Link: https://lore.kernel.org/kvm/000000000000c6526f06137f18cc@google.com/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-5-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 60 +++++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 7f6f5f23fb9b..977948fd52e6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -28,16 +28,16 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 -static void l2_guest_code(void) +static void l2_guest_code(u64 *a, u64 *b) { - *(volatile uint64_t *)NESTED_TEST_MEM1; - *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + READ_ONCE(*a); + WRITE_ONCE(*a, 1); GUEST_SYNC(true); GUEST_SYNC(false); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); @@ -45,17 +45,33 @@ static void l2_guest_code(void) vmcall(); } +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + void l1_guest_code(struct vmx_pages *vmx) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); @@ -64,7 +80,7 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_DONE(); } -int main(int argc, char *argv[]) +static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; struct vmx_pages *vmx; @@ -76,8 +92,7 @@ int main(int argc, char *argv[]) struct ucall uc; bool done = false; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); @@ -103,11 +118,16 @@ int main(int argc, char *argv[]) * * Note that prepare_eptp should be called only L1's GPA map is done, * meaning after the last call to virt_map. + * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. 
*/ - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); @@ -148,3 +168,15 @@ int main(int argc, char *argv[]) } } } + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} -- cgit v1.2.3 From 210cfef579260ed6c3b700e7baeae51a5e183f43 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 4 Apr 2024 17:02:09 -0500 Subject: selftests/powerpc/papr-vpd: Fix missing variable initialization The "close handle without consuming VPD" testcase has inconsistent results because it fails to initialize the location code object it passes to ioctl() to create a VPD handle. Initialize the location code to the empty string as intended. Signed-off-by: Nathan Lynch Fixes: 9118c5d32bdd ("powerpc/selftests: Add test for papr-vpd") Reported-by: Geetika Moolchandani Signed-off-by: Michael Ellerman Link: https://msgid.link/20240404-papr-vpd-test-uninit-lc-v2-1-37bff46c65a5@linux.ibm.com --- tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c index 505294da1b9f..d6f99eb9be65 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c +++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c @@ -154,7 +154,7 @@ static int dev_papr_vpd_null_handle(void) static int papr_vpd_close_handle_without_reading(void) { const int devfd = open(DEVPATH, O_RDONLY); - struct papr_location_code lc; + struct papr_location_code lc = { .str = "", }; int fd; SKIP_IF_MSG(devfd < 0 && errno == ENOENT, -- cgit v1.2.3 From e4a6bceac98eba3c00e874892736b34ea5fdaca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:28 -0700 Subject: selftests: timers: Fix posix_timers ksft_print_msg() warning After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") the following warning occurs when building with an older gcc: posix_timers.c:250:2: warning: format not a string literal and no format arguments [-Wformat-security] 250 | ksft_print_msg(errmsg); | ^~~~~~~~~~~~~~ Fix this up by changing it to ksft_print_msg("%s", errmsg) Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Acked-by: Justin Stitt Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-1-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d86a0e00711e..348f47176e0a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -247,7 +247,7 @@ static int check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); 
return 0; err: - ksft_print_msg(errmsg); + ksft_print_msg("%s", errmsg); return -1; } -- cgit v1.2.3 From f7d5bcd35d427daac7e206b1073ca14f5db85c27 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:45:40 -0700 Subject: selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()"), clang warns: tools/testing/selftests/timers/../kselftest.h:398:6: warning: variable 'major' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized] 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:401:9: note: uninitialized use occurs here 401 | return major > min_major || (major == min_major && minor >= min_minor); | ^~~~~ tools/testing/selftests/timers/../kselftest.h:398:6: note: remove the '||' if its condition is always false 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:395:20: note: initialize the variable 'major' to silence this warning 395 | unsigned int major, minor; | ^ | = 0 This is a false positive because if uname() fails, ksft_exit_fail_msg() will be called, which unconditionally calls exit(), a noreturn function. However, clang does not know that ksft_exit_fail_msg() will call exit() at the point in the pipeline that the warning is emitted because inlining has not occurred, so it assumes control flow will resume normally after ksft_exit_fail_msg() is called. Make it clear to clang that all of the functions that call exit() unconditionally in kselftest.h are noreturn transitively by marking them explicitly with '__attribute__((__noreturn__))', which clears up the warning above and any future warnings that may appear for the same reason. Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: John Stultz Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240411-mark-kselftest-exit-funcs-noreturn-v1-1-b027c948f586@kernel.org Closes: https://lore.kernel.org/all/20240410232637.4135564-2-jstultz@google.com/ --- tools/testing/selftests/kselftest.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 973b18e156b2..0591974b57e0 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -80,6 +80,9 @@ #define KSFT_XPASS 3 #define KSFT_SKIP 4 +#ifndef __noreturn +#define __noreturn __attribute__((__noreturn__)) +#endif #define __printf(a, b) __attribute__((format(printf, a, b))) /* counters */ @@ -300,13 +303,13 @@ void ksft_test_result_code(int exit_code, const char *test_name, va_end(args); } -static inline int ksft_exit_pass(void) +static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline int ksft_exit_fail(void) +static inline __noreturn int ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -333,7 +336,7 @@ static inline int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) 
{ int saved_errno = errno; va_list args; @@ -348,19 +351,19 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) exit(KSFT_FAIL); } -static inline int ksft_exit_xfail(void) +static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline int ksft_exit_xpass(void) +static inline __noreturn int ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; -- cgit v1.2.3 From ed366de8ec89d4f960d66c85fc37d9de22f7bf6d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:30 -0700 Subject: selftests: timers: Fix abs() warning in posix_timers test Building with clang results in the following warning: posix_timers.c:69:6: warning: absolute value function 'abs' given an argument of type 'long long' but has parameter of type 'int' which may cause truncation of value [-Wabsolute-value] if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { ^ So switch to using llabs() instead. Fixes: 0bc4b0cf1570 ("selftests: add basic posix timers selftests") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-3-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 348f47176e0a..c001dd79179d 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end) diff = end.tv_usec - start.tv_usec; diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; - if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { printf("Diff too high: %lld..", diff); return -1; } -- cgit v1.2.3 From 16767502aa990cca2cb7d1372b31d328c4c85b40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 12 Apr 2024 14:35:36 +0200 Subject: selftests: kselftest: Fix build failure with NOLIBC As Mark explains ksft_min_kernel_version() can't be compiled with nolibc, it doesn't implement uname(). Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: Mark Brown Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240412123536.GA32444@redhat.com Closes: https://lore.kernel.org/all/f0523b3a-ea08-4615-b0fb-5b504a2d39df@sirena.org.uk/ --- tools/testing/selftests/kselftest.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0591974b57e0..7b89362da4bf 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -395,6 +395,10 @@ static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) 
static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { +#ifdef NOLIBC + ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); + return 0; +#else unsigned int major, minor; struct utsname info; @@ -402,6 +406,7 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); +#endif } #endif /* __KSELFTEST_H */ -- cgit v1.2.3 From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d089666..e398e1dbd2d3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af3..85b3baa3f7f3 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 -- cgit v1.2.3 From 2760c51b8040d7cffedc337939e7475a17cc4b19 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 25 Mar 2024 14:00:48 +0500 Subject: iommufd: Add config needed for iommufd_fail_nth Add FAULT_INJECTION_DEBUG_FS and FAILSLAB configurations to the kconfig fragment for the iommfd selftests. These kconfigs are needed by the iommufd_fail_nth test. 
Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240325090048.1423908-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 110d73917615..02a2a1b267c1 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,3 +1,5 @@ CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_TEST=y +CONFIG_FAILSLAB=y -- cgit v1.2.3 From 4225dfa4535f219b03ae14147d9c6e7e82ec8df4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:52 +0100 Subject: selftests/tcp_ao: Make RST tests less flaky Currently, "active reset" cases are flaky, because select() is called for 3 sockets, while only 2 are expected to receive RST. The idea of the third socket was to get into request_sock_queue, but the test mistakenly attempted to connect() after the listener socket was shut down. Repair this test, it's important to check the different kernel code-paths for signing RST TCP-AO segments. Fixes: c6df7b2361d7 ("selftests/net: Add TCP-AO RST test") Reported-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/rst.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39..a2fe88d35ac0 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? 
TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); -- cgit v1.2.3 From b089b3bead532419cdcbd8e4e0a3e23c49d11573 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:53 +0100 Subject: selftests/tcp_ao: Zero-init tcp_ao_info_opt The structure is on the stack and has to be zero-initialized as the kernel checks for: > if (in.reserved != 0 || in.reserved2 != 0) > return -EINVAL; Fixes: b26660531cf6 ("selftests/net: Add test for TCP-AO add setsockopt() command") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a..517930f9721b 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) -- cgit v1.2.3 From beb78cd1329d039d73487ca05633d1b92e1ab2ea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:54 +0100 Subject: selftests/tcp_ao: Fix fscanf() call for format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces: > lib/proc.c: In function ‘netstat_read_type’: > lib/proc.c:89:9: error: format not a string literal and no format arguments [-Werror=format-security] > 89 | if (fscanf(fnetstat, type->header_name) == EOF) > | ^~ > cc1: some warnings being treated as errors Here the selftests lib parses header name, while expectes non-space word ending with a column. 
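As a standalone illustration of the scanset the fix switches to (toy code, not the selftest's): "%[^ :]" consumes characters until the first space or colon, so the header word is read with an explicit, literal format string instead of passing runtime data in the format position:

    #include <stdio.h>

    int main(void)
    {
            char header[64];

            /* For input like "TcpExt: SyncookiesSent ..." this reads "TcpExt" */
            if (scanf("%63[^ :]", header) == 1)
                    printf("header: %s\n", header);
            return 0;
    }
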
Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba6..8b984fa04286 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); -- cgit v1.2.3 From b476c93654d748c13624f7c7d0ba191c56a8092e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:55 +0100 Subject: selftests/tcp_ao: Printing fixes to confirm with format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces > lib/setup.c: In function ‘__test_msg’: > lib/setup.c:20:9: error: format not a string literal and no format arguments [-Werror=format-security] > 20 | ksft_print_msg(buf); > | ^~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_ok’: > lib/setup.c:26:9: error: format not a string literal and no format arguments [-Werror=format-security] > 26 | ksft_test_result_pass(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_fail’: > lib/setup.c:32:9: error: format not a string literal and no format arguments [-Werror=format-security] > 32 | ksft_test_result_fail(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_xfail’: > lib/setup.c:38:9: error: format not a string literal and no format arguments [-Werror=format-security] > 38 | ksft_test_result_xfail(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_error’: > lib/setup.c:44:9: error: format not a string literal and no format arguments [-Werror=format-security] > 44 | ksft_test_result_error(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_skip’: > lib/setup.c:50:9: error: format not a string literal and no format arguments [-Werror=format-security] > 50 | ksft_test_result_skip(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors As the buffer was already pre-printed into, print it as a string rather than a format-string. 
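The rule behind both of these -Wformat-security fixes, shown with a self-contained toy example rather than the kselftest code: a buffer filled at runtime must not occupy the format position, because any '%' it happens to contain would be re-parsed as a conversion specifier:

    #include <stdio.h>

    int main(void)
    {
            char buf[64];

            snprintf(buf, sizeof(buf), "progress 100%%"); /* buf now ends in '%' */

            /* printf(buf);      would re-parse that '%' -- undefined behaviour */
            printf("%s\n", buf); /* safe: buf is plain data */
            return 0;
    }
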
Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f..e408b9243b2c 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } -- cgit v1.2.3 From caed8eba221533123192d39cc947f45cbb1e1db5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Apr 2024 08:10:48 -0700 Subject: selftests: kselftest_harness: fix Clang warning about zero-length format Apparently it's more legal to pass the format as NULL, than it is to use an empty string. Clang complains about empty formats: ./../kselftest_harness.h:1207:30: warning: format string is empty [-Wformat-zero-length] 1207 | diagnostic ? "%s" : "", diagnostic); | ^~ 1 warning generated. 
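The convention this establishes is "NULL means no message": callers pass either a real format or NULL, and the printing helper touches the varargs only when a format is present, so neither an empty nor a NULL format ever reaches vprintf(). A hedged sketch of that shape, not the harness code verbatim:

    #include <stdarg.h>
    #include <stdio.h>

    /* Illustrative helper: print an optional message, NULL meaning "none". */
    static void print_optional(const char *fmt, ...)
    {
            va_list args;

            if (!fmt)
                    return;
            va_start(args, fmt);
            vprintf(fmt, args);
            va_end(args);
    }

    int main(void)
    {
            print_optional("%d tests passed\n", 3);
            print_optional(NULL);   /* silently prints nothing */
            return 0;
    }
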
Reported-by: Sean Christopherson Link: https://lore.kernel.org/all/20240409224256.1581292-1-seanjc@google.com Fixes: 378193eff339 ("selftests: kselftest_harness: let PASS / FAIL provide diagnostic") Tested-by: Sean Christopherson Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240416151048.1682352-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest.h | 10 ++++++---- tools/testing/selftests/kselftest_harness.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..4eca3fd1292c 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -288,15 +288,17 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } static inline int ksft_exit_pass(void) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..adb15cae79ab 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1202,7 +1202,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? "%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) -- cgit v1.2.3 From e7bb43898bcf54da7ffb4819a04c8428f7db24db Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 16 Apr 2024 08:00:26 +0200 Subject: ASoC: dapm-graph: new tool to visualize DAPM state Add a tool to generate a picture of the current DAPM state for a sound card. 
dapm-graph is inspired by vizdapm which used to be published on a Wolfson Micro git repository now disappeared, and has a few forks around: https://github.com/mihais/asoc-tools https://github.com/alexandrebelloni/asoc-tools dapm-graph is a full reimplementation with several improvements while still being a self-contained shell script: Improvements to rendered output: - shows the entire card, not one component hierarchy only - each component is rendered in a separate box - shows widget on/off status based on widget information alone (the original vizdapm propagates the "on" green colour to the first input widget) - use bold line and gray background and not only green/red line to show on/off status (for the color blind) Improvements for embedded system developers: - remote mode: get state of remote device (possibly with minimal rootfs) via SSH, but parsing locally for faster operation - compatible with BusyBox shell, not only bash Usability improvements: - flexible command line (uses getopts for parsing) - detailed help text - flag to enable detailed debug logging - graphviz output format detected from file extension, not hard coded - a self-contained shell script Usage is designed to be simple: dapm-grpah -c CARD - get state from debugfs for CARD dapm-grpah -c CARD -r REMOTE_TARGET - same, but remotely via SSH dapm-grpah -d STATE_DIR - from a local copy of the debugfs tree for a card Signed-off-by: Luca Ceresoli Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/r/20240416-vizdapm-ng-v1-3-5d33c0b57bc5@bootlin.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 + tools/sound/dapm-graph | 303 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100755 tools/sound/dapm-graph (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index a7d0dd91ac19..aa3b455588e1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20669,6 +20669,12 @@ F: include/trace/events/sof*.h F: include/uapi/sound/asoc.h F: sound/soc/ +SOUND - SOC LAYER / dapm-graph +M: Luca Ceresoli +L: linux-sound@vger.kernel.org +S: Maintained +F: tools/sound/dapm-graph + SOUND - SOUND OPEN FIRMWARE (SOF) DRIVERS M: Pierre-Louis Bossart M: Liam Girdwood diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph new file mode 100755 index 000000000000..57d78f6df041 --- /dev/null +++ b/tools/sound/dapm-graph @@ -0,0 +1,303 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate a graph of the current DAPM state for an audio card +# +# Copyright 2024 Bootlin +# Author: Luca Ceresoli + +set -eu + +STYLE_NODE_ON="shape=box,style=bold,color=green4" +STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" + +# Print usage and exit +# +# $1 = exit return value +# $2 = error string (required if $1 != 0) +usage() +{ + if [ "${1}" -ne 0 ]; then + echo "${2}" >&2 + fi + + echo " +Generate a graph of the current DAPM state for an audio card. + +The DAPM state can be obtained via debugfs for a card on the local host or +a remote target, or from a local copy of the debugfs tree for the card. 
+ +Usage: + $(basename $0) [options] -c CARD - Local sound card + $(basename $0) [options] -c CARD -r REMOTE_TARGET - Card on remote system + $(basename $0) [options] -d STATE_DIR - Local directory + +Options: + -c CARD Sound card to get DAPM state of + -r REMOTE_TARGET Get DAPM state from REMOTE_TARGET via SSH and SCP + instead of using a local sound card + -d STATE_DIR Get DAPM state from a local copy of a debugfs tree + -o OUT_FILE Output file (default: dapm.dot) + -D Show verbose debugging info + -h Print this help and exit + +The output format is implied by the extension of OUT_FILE: + + * Use the .dot extension to generate a text graph representation in + graphviz dot syntax. + * Any other extension is assumed to be a format supported by graphviz for + rendering, e.g. 'png', 'svg', and will produce both the .dot file and a + picture from it. This requires the 'dot' program from the graphviz + package. +" + + exit ${1} +} + +# Connect to a remote target via SSH, collect all DAPM files from debufs +# into a tarball and get the tarball via SCP into $3/dapm.tar +# +# $1 = target as used by ssh and scp, e.g. "root@192.168.1.1" +# $2 = sound card name +# $3 = temp dir path (present on the host, created on the target) +# $4 = local directory to extract the tarball into +# +# Requires an ssh+scp server, find and tar+gz on the target +# +# Note: the tarball is needed because plain 'scp -r' from debugfs would +# copy only empty files +grab_remote_files() +{ + echo "Collecting DAPM state from ${1}" + dbg_echo "Collected DAPM state in ${3}" + + ssh "${1}" " +set -eu && +cd \"/sys/kernel/debug/asoc/${2}\" && +find * -type d -exec mkdir -p ${3}/dapm-tree/{} \; && +find * -type f -exec cp \"{}\" \"${3}/dapm-tree/{}\" \; && +cd ${3}/dapm-tree && +tar cf ${3}/dapm.tar ." + scp -q "${1}:${3}/dapm.tar" "${3}" + + mkdir -p "${4}" + tar xf "${tmp_dir}/dapm.tar" -C "${4}" +} + +# Parse a widget file and generate graph description in graphviz dot format +# +# Skips any file named "bias_level". +# +# $1 = temporary work dir +# $2 = component name +# $3 = widget filename +process_dapm_widget() +{ + local tmp_dir="${1}" + local c_name="${2}" + local w_file="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + local w_name="$(basename "${w_file}")" + local w_tag="${c_name}_${w_name}" + + if [ "${w_name}" = "bias_level" ]; then + return 0 + fi + + dbg_echo " + Widget: ${w_name}" + + cat "${w_file}" | ( + read line + + if echo "${line}" | grep -q ': On ' + then local node_style="${STYLE_NODE_ON}" + else local node_style="${STYLE_NODE_OFF}" + fi + + local w_type="" + while read line; do + # Collect widget type if present + if echo "${line}" | grep -q '^widget-type '; then + local w_type_raw="$(echo "$line" | cut -d ' ' -f 2)" + dbg_echo " - Widget type: ${w_type_raw}" + + # Note: escaping '\n' is tricky to get working with both + # bash and busybox ash, so use a '%' here and replace it + # later + local w_type="%n[${w_type_raw}]" + fi + + # Collect any links. 
We could use "in" links or "out" links, + # let's use "in" links + if echo "${line}" | grep -q '^in '; then + local w_src=$(echo "$line" | + awk -F\" '{print $6 "_" $4}' | + sed 's/^(null)_/ROOT_/') + dbg_echo " - Input route from: ${w_src}" + echo " \"${w_src}\" -> \"$w_tag\"" >> "${links_file}" + fi + done + + echo " \"${w_tag}\" [label=\"${w_name}${w_type}\",${node_style}]" | + tr '%' '\\' >> "${dot_file}" + ) +} + +# Parse the DAPM tree for a sound card component and generate graph +# description in graphviz dot format +# +# $1 = temporary work dir +# $2 = component directory +# $3 = forced component name (extracted for path if empty) +process_dapm_component() +{ + local tmp_dir="${1}" + local c_dir="${2}" + local c_name="${3}" + local dot_file="${tmp_dir}/main.dot" + local links_file="${tmp_dir}/links.dot" + + if [ -z "${c_name}" ]; then + # Extract directory name into component name: + # "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a" + c_name="$(basename $(dirname "${c_dir}"))" + fi + + dbg_echo " * Component: ${c_name}" + + echo "" >> "${dot_file}" + echo " subgraph \"${c_name}\" {" >> "${dot_file}" + echo " cluster = true" >> "${dot_file}" + echo " label = \"${c_name}\"" >> "${dot_file}" + echo " color=dodgerblue" >> "${dot_file}" + + # Create empty file to ensure it will exist in all cases + >"${links_file}" + + # Iterate over widgets in the component dir + for w_file in ${c_dir}/*; do + process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}" + done + + echo " }" >> "${dot_file}" + + cat "${links_file}" >> "${dot_file}" +} + +# Parse the DAPM tree for a sound card and generate graph description in +# graphviz dot format +# +# $1 = temporary work dir +# $2 = directory tree with DAPM state (either in debugfs or a mirror) +process_dapm_tree() +{ + local tmp_dir="${1}" + local dapm_dir="${2}" + local dot_file="${tmp_dir}/main.dot" + + echo "digraph G {" > "${dot_file}" + echo " fontname=\"sans-serif\"" >> "${dot_file}" + echo " node [fontname=\"sans-serif\"]" >> "${dot_file}" + + + # Process root directory (no component) + process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT" + + # Iterate over components + for c_dir in "${dapm_dir}"/*/dapm + do + process_dapm_component "${tmp_dir}" "${c_dir}" "" + done + + echo "}" >> "${dot_file}" +} + +main() +{ + # Parse command line + local out_file="dapm.dot" + local card_name="" + local remote_target="" + local dapm_tree="" + local dbg_on="" + while getopts "c:r:d:o:Dh" arg; do + case $arg in + c) card_name="${OPTARG}" ;; + r) remote_target="${OPTARG}" ;; + d) dapm_tree="${OPTARG}" ;; + o) out_file="${OPTARG}" ;; + D) dbg_on="1" ;; + h) usage 0 ;; + *) usage 1 ;; + esac + done + shift $(($OPTIND - 1)) + + if [ -n "${dapm_tree}" ]; then + if [ -n "${card_name}${remote_target}" ]; then + usage 1 "Cannot use -c and -r with -d" + fi + echo "Using local tree: ${dapm_tree}" + elif [ -n "${remote_target}" ]; then + if [ -z "${card_name}" ]; then + usage 1 "-r requires -c" + fi + echo "Using card ${card_name} from remote target ${remote_target}" + elif [ -n "${card_name}" ]; then + echo "Using local card: ${card_name}" + else + usage 1 "Please choose mode using -c, -r or -d" + fi + + # Define logging function + if [ "${dbg_on}" ]; then + dbg_echo() { + echo "$*" >&2 + } + else + dbg_echo() { + : + } + fi + + # Filename must have a dot in order the infer the format from the + # extension + if ! 
echo "${out_file}" | grep -qE '\.'; then + echo "Missing extension in output filename ${out_file}" >&2 + usage + exit 1 + fi + + local out_fmt="${out_file##*.}" + local dot_file="${out_file%.*}.dot" + + dbg_echo "dot file: $dot_file" + dbg_echo "Output file: $out_file" + dbg_echo "Output format: $out_fmt" + + tmp_dir="$(mktemp -d /tmp/$(basename $0).XXXXXX)" + trap "{ rm -fr ${tmp_dir}; }" INT TERM EXIT + + if [ -z "${dapm_tree}" ] + then + dapm_tree="/sys/kernel/debug/asoc/${card_name}" + fi + if [ -n "${remote_target}" ]; then + dapm_tree="${tmp_dir}/dapm-tree" + grab_remote_files "${remote_target}" "${card_name}" "${tmp_dir}" "${dapm_tree}" + fi + # In all cases now ${dapm_tree} contains the DAPM state + + process_dapm_tree "${tmp_dir}" "${dapm_tree}" + cp "${tmp_dir}/main.dot" "${dot_file}" + + if [ "${out_file}" != "${dot_file}" ]; then + dot -T"${out_fmt}" "${dot_file}" -o "${out_file}" + fi + + echo "Generated file ${out_file}" +} + +main "${@}" -- cgit v1.2.3 From a44f2eb106a46f2275a79de54ce0ea63e4f3d8c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2024 19:08:26 -0700 Subject: tools: ynl: don't ignore errors in NLMSG_DONE messages NLMSG_DONE contains an error code, it has to be extracted. Prior to this change all dumps will end in success, and in case of failure the result is silently truncated. Fixes: e4b48ed460d3 ("tools: ynl: add a completely generic client") Signed-off-by: Jakub Kicinski Reviewed-by: Donald Hunter Link: https://lore.kernel.org/r/20240420020827.3288615-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/lib/ynl.py | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 5fa7957f6e0f..25810e18b0a7 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -182,6 +182,7 @@ class NlMsg: self.done = 1 extack_off = 20 elif self.nl_type == Netlink.NLMSG_DONE: + self.error = struct.unpack("i", self.raw[0:4])[0] self.done = 1 extack_off = 4 -- cgit v1.2.3
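For readers unfamiliar with the netlink dump protocol, here is a rough C sketch of the wire detail the one-line fix above depends on (nlmsg_done_error() is an illustrative name, not part of ynl): the kernel terminates a dump with an NLMSG_DONE message whose payload begins with a 32-bit status code, so a parser that ignores that field reports every dump, including a failed and therefore truncated one, as a success.

/*
 * Illustrative only; the actual fix is the struct.unpack() line in
 * ynl.py above. This shows the same payload layout from C using the
 * uapi netlink macros.
 */
#include <stdio.h>
#include <string.h>
#include <linux/netlink.h>

/* Return the dump status carried by an NLMSG_DONE message, 0 otherwise. */
static int nlmsg_done_error(struct nlmsghdr *nlh)
{
	int err = 0;

	if (nlh->nlmsg_type == NLMSG_DONE &&
	    nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(int)))
		memcpy(&err, NLMSG_DATA(nlh), sizeof(err));

	return err;		/* negative errno when the dump failed */
}

int main(void)
{
	/* A fabricated DONE message reporting -EINVAL (-22) */
	struct { struct nlmsghdr h; int status; } msg = {
		.h = {
			.nlmsg_len  = NLMSG_LENGTH(sizeof(int)),
			.nlmsg_type = NLMSG_DONE,
		},
		.status = -22,
	};

	printf("dump status: %d\n", nlmsg_done_error(&msg.h));
	return 0;
}

In ynl.py, self.raw holds the payload after the 16-byte header, which is why the fix reads the status from raw[0:4] and the existing extack_off = 4 then skips those four bytes before parsing any extack attributes.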