diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-09-05 21:56:18 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-09-05 21:56:18 +0300 |
commit | 27151f177827d478508e756c7657273261aaf8a9 (patch) | |
tree | 393ee6f1b12842c87461c471c0da70e4ecd93da8 /tools/perf/bench/futex-wake-parallel.c | |
parent | 58ca24158758f1784400d32743373d7d6227d018 (diff) | |
parent | c7a3828d98db2730079265b5f51933dfcef8bb5f (diff) | |
download | linux-27151f177827d478508e756c7657273261aaf8a9.tar.xz |
Merge tag 'perf-tools-for-v5.15-2021-09-04' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
Pull perf tool updates from Arnaldo Carvalho de Melo:
"New features:
- Improvements for the flamegraph python script, including:
- Display perf.data header
- Display PIDs of user stacks
- Added option to change color scheme
- Default to blue/green color scheme to improve accessibility
- Correctly identify kernel stacks when debuginfo is available
- Improvements for 'perf bench futex':
- Add --mlockall parameter
- Add --broadcast and --pi to the 'requeue' sub benchmark
- Add support for PMU aliases.
- Introduce an ARM Coresight ETE decoder.
- Add a 'perf bench' entry for evlist open/close operations, to help
quantify improvements with multithreading 'perf record'.
- Allow reporting the [un]throttle PERF_RECORD_ meta event in 'perf
script's python scripting.
- Add a 'perf test' entry for PMU aliases.
- Add a 'perf test' entry for 'perf record/perf report/perf script'
pipe mode.
Fixes:
- perf script dlfilter (API for filtering via dynamically loaded
shared object introduced in v5.14) fixes and a 'perf test' entry
for it.
- Fix get_current_dir_name() compilation on Android.
- Fix issues with asciidoc and double dashes uses.
- Fix memory leaks in the BTF handling code.
- Fix leftover problems in the Documentation from the infrastructure
originally lifted from the git codebase.
- Fix *probe_vfs_getname.sh 'perf test' failures.
- Handle fd gaps in 'perf test's test__dso_data_reopen().
- Make sure to show disasembly warnings for 'perf annotate --stdio'.
- Fix output from pipe to file and vice-versa in 'perf
record/report/script'.
- Correct 'perf data -h' output.
- Fix wrong comm in system-wide mode with 'perf record --delay'.
- Do not allow --for-each-cgroup without cpu in 'perf stat'
- Make 'perf test --skip' work on shell tests.
- Fix libperf's verbose printing.
Misc improvements:
- Preparatory patches for multithreading various 'perf record' phases
(synthesizing, opening, recording, etc).
- Add sparse context/locking annotations in compiler-types.h, also to
help with the multithreading effort.
- Optimize the generation of the arch specific erno tables used in
'perf trace'.
- Optimize libperf's perf_cpu_map__max().
- Improve ARM's CoreSight warnings.
- Report collisions in AUX records.
- Improve warnings for the LLVM 'perf test' entry.
- Improve the PMU events 'perf test' codebase.
- perf test: Do not compare overheads in the zstd comp test
- Better support annotation on ARM.
- Update 'perf trace's cmd string table to decode sys_bpf() first
arg.
Vendor events:
- Add JSON events and metrics for Intel's Ice Lake, Tiger Lake and
Elhart Lake.
- Update JSON eventsand metrics for Intel's Cascade Lake and Sky Lake
servers.
Hardware tracing:
- Improvements for the ARM hardware tracing auxtrace support"
* tag 'perf-tools-for-v5.15-2021-09-04' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (130 commits)
perf tests: Add test for PMU aliases
perf pmu: Add PMU alias support
perf session: Report collisions in AUX records
perf script python: Allow reporting the [un]throttle PERF_RECORD_ meta event
perf build: Report failure for testing feature libopencsd
perf cs-etm: Show a warning for an unknown magic number
perf cs-etm: Print the decoder name
perf cs-etm: Create ETE decoder
perf cs-etm: Update OpenCSD decoder for ETE
perf cs-etm: Fix typo
perf cs-etm: Save TRCDEVARCH register
perf cs-etm: Refactor out ETMv4 header saving
perf cs-etm: Initialise architecture based on TRCIDR1
perf cs-etm: Refactor initialisation of decoder params.
tools build: Fix feature detect clean for out of source builds
perf evlist: Add evlist__for_each_entry_from() macro
perf evsel: Handle precise_ip fallback in evsel__open_cpu()
perf evsel: Move bpf_counter__install_pe() to success path in evsel__open_cpu()
perf evsel: Move test_attr__open() to success path in evsel__open_cpu()
perf evsel: Move ignore_missing_thread() to fallback code
...
Diffstat (limited to 'tools/perf/bench/futex-wake-parallel.c')
-rw-r--r-- | tools/perf/bench/futex-wake-parallel.c | 66 |
1 files changed, 38 insertions, 28 deletions
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 6e6f5247e1fe..e970e6b9ad53 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -34,6 +34,7 @@ int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe #include <err.h> #include <stdlib.h> #include <sys/time.h> +#include <sys/mman.h> struct thread_data { pthread_t worker; @@ -47,8 +48,7 @@ static unsigned int nwakes = 1; static u_int32_t futex = 0; static pthread_t *blocked_worker; -static bool done = false, silent = false, fshared = false; -static unsigned int nblocked_threads = 0, nwaking_threads = 0; +static bool done = false; static pthread_mutex_t thread_lock; static pthread_cond_t thread_parent, thread_worker; static pthread_barrier_t barrier; @@ -56,11 +56,15 @@ static struct stats waketime_stats, wakeup_stats; static unsigned int threads_starting; static int futex_flag = 0; +static struct bench_futex_parameters params; + static const struct option options[] = { - OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"), - OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"), - OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), - OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakers", ¶ms.nwakes, "Specify amount of waking threads"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_END() }; @@ -96,10 +100,10 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); - pthread_barrier_init(&barrier, NULL, nwaking_threads + 1); + pthread_barrier_init(&barrier, NULL, params.nwakes + 1); /* create and block all threads */ - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { /* * Thread creation order will impact per-thread latency * as it will affect the order to acquire the hb spinlock. @@ -112,7 +116,7 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) pthread_barrier_wait(&barrier); - for (i = 0; i < nwaking_threads; i++) + for (i = 0; i < params.nwakes; i++) if (pthread_join(td[i].worker, NULL)) err(EXIT_FAILURE, "pthread_join"); @@ -143,10 +147,10 @@ static void block_threads(pthread_t *w, pthread_attr_t thread_attr, cpu_set_t cpuset; unsigned int i; - threads_starting = nblocked_threads; + threads_starting = params.nthreads; /* create and block all threads */ - for (i = 0; i < nblocked_threads; i++) { + for (i = 0; i < params.nthreads; i++) { CPU_ZERO(&cpuset); CPU_SET(cpu->map[i % cpu->nr], &cpuset); @@ -167,7 +171,7 @@ static void print_run(struct thread_data *waking_worker, unsigned int run_num) init_stats(&__wakeup_stats); init_stats(&__waketime_stats); - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec); update_stats(&__wakeup_stats, waking_worker[i].nwoken); } @@ -178,7 +182,7 @@ static void print_run(struct thread_data *waking_worker, unsigned int run_num) printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) " "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg, - nblocked_threads, waketime_avg / USEC_PER_MSEC, + params.nthreads, waketime_avg / USEC_PER_MSEC, rel_stddev_stats(waketime_stddev, waketime_avg)); } @@ -193,7 +197,7 @@ static void print_summary(void) printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n", wakeup_avg, - nblocked_threads, + params.nthreads, waketime_avg / USEC_PER_MSEC, rel_stddev_stats(waketime_stddev, waketime_avg)); } @@ -203,7 +207,7 @@ static void do_run_stats(struct thread_data *waking_worker) { unsigned int i; - for (i = 0; i < nwaking_threads; i++) { + for (i = 0; i < params.nwakes; i++) { update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec); update_stats(&wakeup_stats, waking_worker[i].nwoken); } @@ -238,36 +242,42 @@ int bench_futex_wake_parallel(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + cpu = perf_cpu_map__new(NULL); if (!cpu) err(EXIT_FAILURE, "calloc"); - if (!nblocked_threads) - nblocked_threads = cpu->nr; + if (!params.nthreads) + params.nthreads = cpu->nr; /* some sanity checks */ - if (nwaking_threads > nblocked_threads || !nwaking_threads) - nwaking_threads = nblocked_threads; + if (params.nwakes > params.nthreads || + !params.nwakes) + params.nwakes = params.nthreads; - if (nblocked_threads % nwaking_threads) + if (params.nthreads % params.nwakes) errx(EXIT_FAILURE, "Must be perfectly divisible"); /* * Each thread will wakeup nwakes tasks in * a single futex_wait call. */ - nwakes = nblocked_threads/nwaking_threads; + nwakes = params.nthreads/params.nwakes; - blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker)); + blocked_worker = calloc(params.nthreads, sizeof(*blocked_worker)); if (!blocked_worker) err(EXIT_FAILURE, "calloc"); - if (!fshared) + if (!params.fshared) futex_flag = FUTEX_PRIVATE_FLAG; printf("Run summary [PID %d]: blocking on %d threads (at [%s] " "futex %p), %d threads waking up %d at a time.\n\n", - getpid(), nblocked_threads, fshared ? "shared":"private", - &futex, nwaking_threads, nwakes); + getpid(), params.nthreads, params.fshared ? "shared":"private", + &futex, params.nwakes, nwakes); init_stats(&wakeup_stats); init_stats(&waketime_stats); @@ -278,7 +288,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) pthread_cond_init(&thread_worker, NULL); for (j = 0; j < bench_repeat && !done; j++) { - waking_worker = calloc(nwaking_threads, sizeof(*waking_worker)); + waking_worker = calloc(params.nwakes, sizeof(*waking_worker)); if (!waking_worker) err(EXIT_FAILURE, "calloc"); @@ -297,14 +307,14 @@ int bench_futex_wake_parallel(int argc, const char **argv) /* Ok, all threads are patiently blocked, start waking folks up */ wakeup_threads(waking_worker, thread_attr); - for (i = 0; i < nblocked_threads; i++) { + for (i = 0; i < params.nthreads; i++) { ret = pthread_join(blocked_worker[i], NULL); if (ret) err(EXIT_FAILURE, "pthread_join"); } do_run_stats(waking_worker); - if (!silent) + if (!params.silent) print_run(waking_worker, j); free(waking_worker); |