summaryrefslogtreecommitdiff
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c218
1 files changed, 186 insertions, 32 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e1459df73043..bc9b98a9af9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -28,6 +28,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -387,6 +388,7 @@ static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -981,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-/*
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
- * cleared when last cgroup event is removed.
- */
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
- struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -1007,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event,
* because if the first would mismatch, the second would not try again
* and we would leave cpuctx->cgrp unset.
*/
- if (add && !cpuctx->cgrp) {
+ if (ctx->is_active && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
}
- if (add && ctx->nr_cgroups++)
+ if (ctx->nr_cgroups++)
return;
- else if (!add && --ctx->nr_cgroups)
+
+ list_add(&cpuctx->cgrp_cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+}
+
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
return;
- /* no cgroup running */
- if (!add)
+ /*
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
+ */
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (--ctx->nr_cgroups)
+ return;
+
+ if (ctx->is_active && cpuctx->cgrp)
cpuctx->cgrp = NULL;
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- if (add)
- list_add(cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
- else
- list_del(cpuctx_entry);
+ list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1094,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+}
#endif
/*
@@ -1789,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
add_event_to_groups(event, ctx);
}
- list_update_cgroup_event(event, ctx, true);
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_enable(event, ctx);
+
ctx->generation++;
}
@@ -1861,6 +1874,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
+
event->header_size = size;
}
@@ -1971,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- list_update_cgroup_event(event, ctx, false);
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1989,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of error state is by explicit re-enabling
* of the event
*/
- if (event->state > PERF_EVENT_STATE_OFF)
+ if (event->state > PERF_EVENT_STATE_OFF) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ }
ctx->generation++;
}
@@ -2221,6 +2237,7 @@ event_sched_out(struct perf_event *event,
if (READ_ONCE(event->pending_disable) >= 0) {
WRITE_ONCE(event->pending_disable, -1);
+ perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2358,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ perf_cgroup_event_disable(event, ctx);
}
/*
@@ -2741,7 +2759,7 @@ static int __perf_install_in_context(void *info)
}
#ifdef CONFIG_CGROUP_PERF
- if (is_cgroup_event(event)) {
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2901,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
@@ -3503,7 +3522,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
static bool perf_less_group_idx(const void *l, const void *r)
{
- const struct perf_event *le = l, *re = r;
+ const struct perf_event *le = *(const struct perf_event **)l;
+ const struct perf_event *re = *(const struct perf_event **)r;
return le->group_index < re->group_index;
}
@@ -3611,8 +3631,10 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
- if (event->attr.pinned)
+ if (event->attr.pinned) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
*can_add_hw = 0;
ctx->rotate_necessary = 1;
@@ -4608,6 +4630,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
atomic_dec(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_dec(&nr_cgroup_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6864,6 +6888,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
@@ -6907,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
@@ -7063,6 +7093,16 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+#ifdef CONFIG_CGROUP_PERF
+ if (sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_id(cgrp);
+ }
+#endif
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
@@ -7736,6 +7776,105 @@ void perf_event_namespaces(struct task_struct *task)
}
/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_event {
+ char *path;
+ int path_size;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ } event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{
+ return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{
+ struct perf_cgroup_event *cgroup_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = cgroup_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_cgroup_match(event))
+ return;
+
+ perf_event_header__init_id(&cgroup_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ cgroup_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, cgroup_event->event_id);
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ cgroup_event->event_id.header.size = header_size;
+}
+
+static void perf_event_cgroup(struct cgroup *cgrp)
+{
+ struct perf_cgroup_event cgroup_event;
+ char path_enomem[16] = "//enomem";
+ char *pathname;
+ size_t size;
+
+ if (!atomic_read(&nr_cgroup_events))
+ return;
+
+ cgroup_event = (struct perf_cgroup_event){
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_CGROUP,
+ .misc = 0,
+ .size = sizeof(cgroup_event.event_id),
+ },
+ .id = cgroup_id(cgrp),
+ },
+ };
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (pathname == NULL) {
+ cgroup_event.path = path_enomem;
+ } else {
+ /* just to be sure to have enough space for alignment */
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
+ cgroup_event.path = pathname;
+ }
+
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(cgroup_event.path) + 1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ cgroup_event.path[size++] = '\0';
+
+ cgroup_event.event_id.header.size += size;
+ cgroup_event.path_size = size;
+
+ perf_iterate_sb(perf_event_cgroup_output,
+ &cgroup_event,
+ NULL);
+
+ kfree(pathname);
+}
+
+#endif
+
+/*
* mmap tracking
*/
@@ -7855,7 +7994,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
+ if (is_vm_hugetlb_page(vma))
flags |= MAP_HUGETLB;
if (file) {
@@ -10778,6 +10917,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
atomic_inc(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_inc(&nr_cgroup_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -11157,6 +11298,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+
out:
return ret;
@@ -12754,6 +12901,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(jc);
}
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ perf_event_cgroup(css->cgroup);
+ return 0;
+}
+
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
@@ -12775,6 +12928,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
+ .css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach,
/*
* Implicitly enable on dfl hierarchy so that perf events can