summaryrefslogtreecommitdiff
path: root/tools/perf/util/event.c
diff options
context:
space:
mode:
authorKan Liang <kan.liang@intel.com>2017-09-29 17:47:54 +0300
committerArnaldo Carvalho de Melo <acme@redhat.com>2017-10-03 15:27:46 +0300
commit340b47f510bbe55a76b7309107276f02ea11f117 (patch)
tree4dde66840cd57b47c01f7c350ad545347f4e2178 /tools/perf/util/event.c
parentf988e71bc6220d8b404dbd43c0e0962e30305795 (diff)
downloadlinux-340b47f510bbe55a76b7309107276f02ea11f117.tar.xz
perf top: Implement multithreading for perf_event__synthesize_threads
The proc files which is sorted with alphabetical order are evenly assigned to several synthesize threads to be processed in parallel. For 'perf top', the threads number hard code to online CPU number. The following patch will introduce an option to set it. For other perf tools, the thread number is 1. Because the process function is not ready for multithreading, e.g. process_synthesized_event. This patch series only support event synthesize multithreading for 'perf top'. For other tools, it can be done separately later. With multithread applied, the total processing time can get up to 1.56x speedup on Knights Mill for 'perf top'. For specific single event processing, the processing time could increase because of the lock contention. So proc_map_timeout may need to be increased. Otherwise some proc maps will be truncated. Based on my test, increasing the proc_map_timeout has small impact on the total processing time. The total processing time still get 1.49x speedup on Knights Mill after increasing the proc_map_timeout. The patch itself doesn't increase the proc_map_timeout. Doesn't need to implement multithreading for per task monitoring, perf_event__synthesize_thread_map. It doesn't have performance issue. Committer testing: # getconf _NPROCESSORS_ONLN 4 # perf trace --no-inherit -e clone -o /tmp/output perf top # tail -4 /tmp/bla 0.124 ( 0.041 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eb3a8f30, parent_tidptr: 0x7fc3eb3a99d0, child_tidptr: 0x7fc3eb3a99d0, tls: 0x7fc3eb3a9700) = 9548 (perf) 0.246 ( 0.023 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eaba7f30, parent_tidptr: 0x7fc3eaba89d0, child_tidptr: 0x7fc3eaba89d0, tls: 0x7fc3eaba8700) = 9549 (perf) 0.286 ( 0.019 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9550 (perf) 246.540 ( 0.047 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9551 (perf) # Signed-off-by: Kan Liang <kan.liang@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andi Kleen <ak@linux.intel.com> Cc: He Kuang <hekuang@huawei.com> Cc: Lukasz Odzioba <lukasz.odzioba@intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/r/1506696477-146932-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/util/event.c')
-rw-r--r--tools/perf/util/event.c160
1 files changed, 129 insertions, 31 deletions
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 10366b87d0b5..0e678dd6bdbe 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -678,23 +678,21 @@ out:
return err;
}
-int perf_event__synthesize_threads(struct perf_tool *tool,
- perf_event__handler_t process,
- struct machine *machine,
- bool mmap_data,
- unsigned int proc_map_timeout)
+static int __perf_event__synthesize_threads(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine,
+ bool mmap_data,
+ unsigned int proc_map_timeout,
+ struct dirent **dirent,
+ int start,
+ int num)
{
union perf_event *comm_event, *mmap_event, *fork_event;
union perf_event *namespaces_event;
- char proc_path[PATH_MAX];
- struct dirent **dirent;
int err = -1;
char *end;
pid_t pid;
- int n, i;
-
- if (machine__is_default_guest(machine))
- return 0;
+ int i;
comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
if (comm_event == NULL)
@@ -714,34 +712,25 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
if (namespaces_event == NULL)
goto out_free_fork;
- snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
- n = scandir(proc_path, &dirent, 0, alphasort);
-
- if (n < 0)
- goto out_free_namespaces;
-
- for (i = 0; i < n; i++) {
+ for (i = start; i < start + num; i++) {
if (!isdigit(dirent[i]->d_name[0]))
continue;
pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
/* only interested in proper numerical dirents */
- if (!*end) {
- /*
- * We may race with exiting thread, so don't stop just because
- * one thread couldn't be synthesized.
- */
- __event__synthesize_thread(comm_event, mmap_event, fork_event,
- namespaces_event, pid, 1, process,
- tool, machine, mmap_data,
- proc_map_timeout);
- }
- free(dirent[i]);
+ if (*end)
+ continue;
+ /*
+ * We may race with exiting thread, so don't stop just because
+ * one thread couldn't be synthesized.
+ */
+ __event__synthesize_thread(comm_event, mmap_event, fork_event,
+ namespaces_event, pid, 1, process,
+ tool, machine, mmap_data,
+ proc_map_timeout);
}
- free(dirent);
err = 0;
-out_free_namespaces:
free(namespaces_event);
out_free_fork:
free(fork_event);
@@ -753,6 +742,115 @@ out:
return err;
}
+struct synthesize_threads_arg {
+ struct perf_tool *tool;
+ perf_event__handler_t process;
+ struct machine *machine;
+ bool mmap_data;
+ unsigned int proc_map_timeout;
+ struct dirent **dirent;
+ int num;
+ int start;
+};
+
+static void *synthesize_threads_worker(void *arg)
+{
+ struct synthesize_threads_arg *args = arg;
+
+ __perf_event__synthesize_threads(args->tool, args->process,
+ args->machine, args->mmap_data,
+ args->proc_map_timeout, args->dirent,
+ args->start, args->num);
+ return NULL;
+}
+
+int perf_event__synthesize_threads(struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine,
+ bool mmap_data,
+ unsigned int proc_map_timeout,
+ unsigned int nr_threads_synthesize)
+{
+ struct synthesize_threads_arg *args = NULL;
+ pthread_t *synthesize_threads = NULL;
+ char proc_path[PATH_MAX];
+ struct dirent **dirent;
+ int num_per_thread;
+ int m, n, i, j;
+ int thread_nr;
+ int base = 0;
+ int err = -1;
+
+
+ if (machine__is_default_guest(machine))
+ return 0;
+
+ snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
+ n = scandir(proc_path, &dirent, 0, alphasort);
+ if (n < 0)
+ return err;
+
+ thread_nr = nr_threads_synthesize;
+
+ if (thread_nr <= 1) {
+ err = __perf_event__synthesize_threads(tool, process,
+ machine, mmap_data,
+ proc_map_timeout,
+ dirent, base, n);
+ goto free_dirent;
+ }
+ if (thread_nr > n)
+ thread_nr = n;
+
+ synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
+ if (synthesize_threads == NULL)
+ goto free_dirent;
+
+ args = calloc(sizeof(*args), thread_nr);
+ if (args == NULL)
+ goto free_threads;
+
+ num_per_thread = n / thread_nr;
+ m = n % thread_nr;
+ for (i = 0; i < thread_nr; i++) {
+ args[i].tool = tool;
+ args[i].process = process;
+ args[i].machine = machine;
+ args[i].mmap_data = mmap_data;
+ args[i].proc_map_timeout = proc_map_timeout;
+ args[i].dirent = dirent;
+ }
+ for (i = 0; i < m; i++) {
+ args[i].num = num_per_thread + 1;
+ args[i].start = i * args[i].num;
+ }
+ if (i != 0)
+ base = args[i-1].start + args[i-1].num;
+ for (j = i; j < thread_nr; j++) {
+ args[j].num = num_per_thread;
+ args[j].start = base + (j - i) * args[i].num;
+ }
+
+ for (i = 0; i < thread_nr; i++) {
+ if (pthread_create(&synthesize_threads[i], NULL,
+ synthesize_threads_worker, &args[i]))
+ goto out_join;
+ }
+ err = 0;
+out_join:
+ for (i = 0; i < thread_nr; i++)
+ pthread_join(synthesize_threads[i], NULL);
+ free(args);
+free_threads:
+ free(synthesize_threads);
+free_dirent:
+ for (i = 0; i < n; i++)
+ free(dirent[i]);
+ free(dirent);
+
+ return err;
+}
+
struct process_symbol_args {
const char *name;
u64 start;