summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-14 03:13:47 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-14 03:13:47 +0300
commit17ca7fc22f4bbc795e4d136449521b2fecb88e06 (patch)
treea1bee109d288772d04c4832109593e0e736fc6e3
parent48fc82c40bc29a80361b1eab0e4a9494628a7144 (diff)
parent854dd99b5ddc9d90e31e5f112462a5994dd31810 (diff)
downloadlinux-17ca7fc22f4bbc795e4d136449521b2fecb88e06.tar.xz
Merge tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf events updates from Ingo Molnar: - Combine perf and BPF for fast evalution of HW breakpoint conditions - Add LBR capture support outside of hardware events - Trigger IO signals for watermark_wakeup - Add RAPL support for Intel Arrow Lake and Lunar Lake - Optimize frequency-throttling - Miscellaneous cleanups & fixes * tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits) perf/bpf: Mark perf_event_set_bpf_handler() and perf_event_free_bpf_handler() as inline too selftests/perf_events: Test FASYNC with watermark wakeups perf/ring_buffer: Trigger IO signals for watermark_wakeup perf: Move perf_event_fasync() to perf_event.h perf/bpf: Change the !CONFIG_BPF_SYSCALL stubs to static inlines selftest/bpf: Test a perf BPF program that suppresses side effects perf/bpf: Allow a BPF program to suppress all sample side effects perf/bpf: Remove unneeded uses_default_overflow_handler() perf/bpf: Call BPF handler directly, not through overflow machinery perf/bpf: Remove #ifdef CONFIG_BPF_SYSCALL from struct perf_event members perf/bpf: Create bpf_overflow_handler() stub for !CONFIG_BPF_SYSCALL perf/bpf: Reorder bpf_overflow_handler() ahead of __perf_event_overflow() perf/x86/rapl: Add support for Intel Lunar Lake perf/x86/rapl: Add support for Intel Arrow Lake perf/core: Reduce PMU access to adjust sample freq perf/core: Optimize perf_adjust_freq_unthr_context() perf/x86/amd: Don't reject non-sampling events with configured LBR perf/x86/amd: Support capturing LBR from software events perf/x86/amd: Avoid taking branches before disabling LBR perf/x86/amd: Ensure amd_pmu_core_disable_all() is always inlined ...
-rw-r--r--arch/arm/kernel/hw_breakpoint.c8
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c4
-rw-r--r--arch/x86/events/amd/core.c37
-rw-r--r--arch/x86/events/amd/lbr.c13
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/events/rapl.c7
-rw-r--r--include/linux/perf_event.h37
-rw-r--r--kernel/events/core.c273
-rw-r--r--kernel/events/ring_buffer.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_skip.c137
-rw-r--r--tools/testing/selftests/bpf/progs/test_perf_skip.c15
-rw-r--r--tools/testing/selftests/perf_events/.gitignore1
-rw-r--r--tools/testing/selftests/perf_events/Makefile2
-rw-r--r--tools/testing/selftests/perf_events/watermark_signal.c146
14 files changed, 525 insertions, 172 deletions
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index dc0fb7a81371..054e9199f30d 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
hw->address &= ~alignment_mask;
hw->ctrl.len <<= offset;
- if (uses_default_overflow_handler(bp)) {
+ if (is_default_overflow_handler(bp)) {
/*
* Mismatch breakpoints are required for single-stepping
* breakpoints.
@@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* Otherwise, insert a temporary mismatch breakpoint so that
* we can single-step over the watchpoint trigger.
*/
- if (!uses_default_overflow_handler(wp))
+ if (!is_default_overflow_handler(wp))
continue;
step:
enable_single_step(wp, instruction_pointer(regs));
@@ -811,7 +811,7 @@ step:
info->trigger = addr;
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
perf_bp_event(wp, regs);
- if (uses_default_overflow_handler(wp))
+ if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
}
@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
info->trigger = addr;
pr_debug("breakpoint fired: address = 0x%x\n", addr);
perf_bp_event(bp, regs);
- if (uses_default_overflow_handler(bp))
+ if (is_default_overflow_handler(bp))
enable_single_step(bp, addr);
goto unlock;
}
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 2f5755192c2b..722ac45f9f7b 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -655,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr,
perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */
- if (uses_default_overflow_handler(bp))
+ if (is_default_overflow_handler(bp))
step = 1;
unlock:
rcu_read_unlock();
@@ -734,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
static int watchpoint_report(struct perf_event *wp, unsigned long addr,
struct pt_regs *regs)
{
- int step = uses_default_overflow_handler(wp);
+ int step = is_default_overflow_handler(wp);
struct arch_hw_breakpoint *info = counter_arch_bp(wp);
info->trigger = addr;
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 985ef3b47919..1fc4ce44e743 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -647,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
@@ -907,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled);
}
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to caller to provide enough space in *entries* to fit all
+ * LBR records, otherwise returned result will be truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /*
+ * The sequence of steps to freeze LBR should be completely inlined
+ * and contain no branches to minimize contamination of LBR snapshot
+ */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min(cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1443,6 +1474,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* Only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) {
/*
* BRS requires special event constraints and flushing on ctxsw.
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 5149830c7c4f..19c7b76e21bc 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{
int ret = 0;
- /* LBR is not recommended in counting mode */
- if (!is_sampling_event(event))
- return -EINVAL;
-
ret = amd_pmu_lbr_setup_filter(event);
if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB;
@@ -414,18 +410,11 @@ void amd_pmu_lbr_enable_all(void)
void amd_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- u64 dbg_ctl, dbg_extn_cfg;
if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return;
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
-
- if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- }
+ __amd_pmu_lbr_disable();
}
__init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ec..72b022a1e16c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+}
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS
#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fb2b1961e5a3..ca5f687fa420 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = {
static int __init init_rapl_pmus(void)
{
int maxdie = topology_max_packages() * topology_max_dies_per_package();
- size_t size;
- size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
- rapl_pmus = kzalloc(size, GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
@@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d2a15c0c6f8a..a5304ae8c654 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -809,11 +809,8 @@ struct perf_event {
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;
-#ifdef CONFIG_BPF_SYSCALL
- perf_overflow_handler_t orig_overflow_handler;
struct bpf_prog *prog;
u64 bpf_cookie;
-#endif
#ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event;
@@ -883,6 +880,7 @@ struct perf_event_pmu_context {
unsigned int nr_events;
unsigned int nr_cgroups;
+ unsigned int nr_freq;
atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
@@ -897,6 +895,11 @@ struct perf_event_pmu_context {
int rotate_necessary;
};
+static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
+{
+ return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
+}
+
struct perf_event_groups {
struct rb_root tree;
u64 index;
@@ -1342,8 +1345,10 @@ extern int perf_event_output(struct perf_event *event,
struct pt_regs *regs);
static inline bool
-__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
+is_default_overflow_handler(struct perf_event *event)
{
+ perf_overflow_handler_t overflow_handler = event->overflow_handler;
+
if (likely(overflow_handler == perf_event_output_forward))
return true;
if (unlikely(overflow_handler == perf_event_output_backward))
@@ -1351,22 +1356,6 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
return false;
}
-#define is_default_overflow_handler(event) \
- __is_default_overflow_handler((event)->overflow_handler)
-
-#ifdef CONFIG_BPF_SYSCALL
-static inline bool uses_default_overflow_handler(struct perf_event *event)
-{
- if (likely(is_default_overflow_handler(event)))
- return true;
-
- return __is_default_overflow_handler(event->orig_overflow_handler);
-}
-#else
-#define uses_default_overflow_handler(event) \
- is_default_overflow_handler(event)
-#endif
-
extern void
perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
@@ -1697,6 +1686,14 @@ perf_event_addr_filters(struct perf_event *event)
return ifh;
}
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* Only the parent has fasync state */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..6b0a66ed2ae3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu--;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq--;
+ epc->nr_freq--;
+ }
if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;
@@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event))
cpc->active_oncpu++;
- if (event->attr.freq && event->attr.sample_freq)
+ if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
cpc->exclusive = 1;
@@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || unthrottle))
- return;
-
- raw_spin_lock(&ctx->lock);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -4154,18 +4141,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
if (!event_filter_match(event))
continue;
- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
+ if (!event->attr.freq || !event->attr.sample_freq)
+ event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ continue;
/*
* stop the event and update event->count
@@ -4187,8 +4173,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (!perf_pmu_ctx_is_active(pmu_ctx))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}
raw_spin_unlock(&ctx->lock);
@@ -6684,14 +6703,6 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
-static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
-{
- /* only the parent has fasync state */
- if (event->parent)
- event = event->parent;
- return &event->fasync;
-}
-
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
@@ -9544,6 +9555,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r
return true;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .event = event,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+
+ return ret;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
+
+ event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
+ return 0;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static inline int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return 1;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
/*
* Generic event overflow handling, sampling.
*/
@@ -9564,6 +9669,9 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
+ if (event->prog && !bpf_overflow_handler(event, data, regs))
+ return ret;
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -10422,97 +10530,6 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
-#ifdef CONFIG_BPF_SYSCALL
-static void bpf_overflow_handler(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- struct bpf_perf_event_data_kern ctx = {
- .data = data,
- .event = event,
- };
- struct bpf_prog *prog;
- int ret = 0;
-
- ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
- goto out;
- rcu_read_lock();
- prog = READ_ONCE(event->prog);
- if (prog) {
- perf_prepare_sample(data, event, regs);
- ret = bpf_prog_run(prog, &ctx);
- }
- rcu_read_unlock();
-out:
- __this_cpu_dec(bpf_prog_active);
- if (!ret)
- return;
-
- event->orig_overflow_handler(event, data, regs);
-}
-
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- if (event->overflow_handler_context)
- /* hw breakpoint or kernel counter */
- return -EINVAL;
-
- if (event->prog)
- return -EEXIST;
-
- if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
- return -EINVAL;
-
- if (event->attr.precise_ip &&
- prog->call_get_stack &&
- (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
- event->attr.exclude_callchain_kernel ||
- event->attr.exclude_callchain_user)) {
- /*
- * On perf_event with precise_ip, calling bpf_get_stack()
- * may trigger unwinder warnings and occasional crashes.
- * bpf_get_[stack|stackid] works around this issue by using
- * callchain attached to perf_sample_data. If the
- * perf_event does not full (kernel and user) callchain
- * attached to perf_sample_data, do not allow attaching BPF
- * program that calls bpf_get_[stack|stackid].
- */
- return -EPROTO;
- }
-
- event->prog = prog;
- event->bpf_cookie = bpf_cookie;
- event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
- WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
- return 0;
-}
-
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
- struct bpf_prog *prog = event->prog;
-
- if (!prog)
- return;
-
- WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
- event->prog = NULL;
- bpf_prog_put(prog);
-}
-#else
-static int perf_event_set_bpf_handler(struct perf_event *event,
- struct bpf_prog *prog,
- u64 bpf_cookie)
-{
- return -EOPNOTSUPP;
-}
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
-}
-#endif
-
/*
* returns true if the event is a tracepoint, or a kprobe/upprobe created
* with perf_event_open()
@@ -11971,13 +11988,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
- if (overflow_handler == bpf_overflow_handler) {
+ if (parent_event->prog) {
struct bpf_prog *prog = parent_event->prog;
bpf_prog_inc(prog);
event->prog = prog;
- event->orig_overflow_handler =
- parent_event->orig_overflow_handler;
}
#endif
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 60ed43d1c29e..4013408ce012 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
+
+ if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
+ handle->event->pending_kill = POLL_IN;
+
irq_work_queue(&handle->event->pending_irq);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
new file mode 100644
index 000000000000..37d8618800e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <test_progs.h>
+#include "test_perf_skip.skel.h"
+#include <linux/compiler.h>
+#include <linux/hw_breakpoint.h>
+#include <sys/mman.h>
+
+#ifndef TRAP_PERF
+#define TRAP_PERF 6
+#endif
+
+int sigio_count, sigtrap_count;
+
+static void handle_sigio(int sig __always_unused)
+{
+ ++sigio_count;
+}
+
+static void handle_sigtrap(int signum __always_unused,
+ siginfo_t *info,
+ void *ucontext __always_unused)
+{
+ ASSERT_EQ(info->si_code, TRAP_PERF, "si_code");
+ ++sigtrap_count;
+}
+
+static noinline int test_function(void)
+{
+ asm volatile ("");
+ return 0;
+}
+
+void serial_test_perf_skip(void)
+{
+ struct sigaction action = {};
+ struct sigaction previous_sigtrap;
+ sighandler_t previous_sigio = SIG_ERR;
+ struct test_perf_skip *skel = NULL;
+ struct perf_event_attr attr = {};
+ int perf_fd = -1;
+ int err;
+ struct f_owner_ex owner;
+ struct bpf_link *prog_link = NULL;
+
+ action.sa_flags = SA_SIGINFO | SA_NODEFER;
+ action.sa_sigaction = handle_sigtrap;
+ sigemptyset(&action.sa_mask);
+ if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction"))
+ return;
+
+ previous_sigio = signal(SIGIO, handle_sigio);
+ if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal"))
+ goto cleanup;
+
+ skel = test_perf_skip__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_load"))
+ goto cleanup;
+
+ attr.type = PERF_TYPE_BREAKPOINT;
+ attr.size = sizeof(attr);
+ attr.bp_type = HW_BREAKPOINT_X;
+ attr.bp_addr = (uintptr_t)test_function;
+ attr.bp_len = sizeof(long);
+ attr.sample_period = 1;
+ attr.sample_type = PERF_SAMPLE_IP;
+ attr.pinned = 1;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.precise_ip = 3;
+ attr.sigtrap = 1;
+ attr.remove_on_exec = 1;
+
+ perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+ if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
+ printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n");
+ test__skip();
+ goto cleanup;
+ }
+ if (!ASSERT_OK(perf_fd < 0, "perf_event_open"))
+ goto cleanup;
+
+ /* Configure the perf event to signal on sample. */
+ err = fcntl(perf_fd, F_SETFL, O_ASYNC);
+ if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)"))
+ goto cleanup;
+
+ owner.type = F_OWNER_TID;
+ owner.pid = syscall(__NR_gettid);
+ err = fcntl(perf_fd, F_SETOWN_EX, &owner);
+ if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)"))
+ goto cleanup;
+
+ /* Allow at most one sample. A sample rejected by bpf should
+ * not count against this.
+ */
+ err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
+ if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
+ goto cleanup;
+
+ prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd);
+ if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event"))
+ goto cleanup;
+
+ /* Configure the bpf program to suppress the sample. */
+ skel->bss->ip = (uintptr_t)test_function;
+ test_function();
+
+ ASSERT_EQ(sigio_count, 0, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 0, "sigtrap_count");
+
+ /* Configure the bpf program to allow the sample. */
+ skel->bss->ip = 0;
+ test_function();
+
+ ASSERT_EQ(sigio_count, 1, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+ /* Test that the sample above is the only one allowed (by perf, not
+ * by bpf)
+ */
+ test_function();
+
+ ASSERT_EQ(sigio_count, 1, "sigio_count");
+ ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+cleanup:
+ bpf_link__destroy(prog_link);
+ if (perf_fd >= 0)
+ close(perf_fd);
+ test_perf_skip__destroy(skel);
+
+ if (previous_sigio != SIG_ERR)
+ signal(SIGIO, previous_sigio);
+ sigaction(SIGTRAP, &previous_sigtrap, NULL);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c
new file mode 100644
index 000000000000..7eb8b6de7a57
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+uintptr_t ip;
+
+SEC("perf_event")
+int handler(struct bpf_perf_event_data *data)
+{
+ /* Skip events that have the correct ip. */
+ return ip != PT_REGS_IP(&data->regs);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore
index 790c47001e77..ee93dc4969b8 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
sigtrap_threads
remove_on_exec
+watermark_signal
diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index db93c4ff081a..70e3ff211278 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c
new file mode 100644
index 000000000000..49dc1e831174
--- /dev/null
+++ b/tools/testing/selftests/perf_events/watermark_signal.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <stddef.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#define __maybe_unused __attribute__((__unused__))
+
+static int sigio_count;
+
+static void handle_sigio(int signum __maybe_unused,
+ siginfo_t *oh __maybe_unused,
+ void *uc __maybe_unused)
+{
+ ++sigio_count;
+}
+
+static void do_child(void)
+{
+ raise(SIGSTOP);
+
+ for (int i = 0; i < 20; ++i)
+ sleep(1);
+
+ raise(SIGSTOP);
+
+ exit(0);
+}
+
+TEST(watermark_signal)
+{
+ struct perf_event_attr attr;
+ struct perf_event_mmap_page *p = NULL;
+ struct sigaction previous_sigio, sigio = { 0 };
+ pid_t child = -1;
+ int child_status;
+ int fd = -1;
+ long page_size = sysconf(_SC_PAGE_SIZE);
+
+ sigio.sa_sigaction = handle_sigio;
+ EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0);
+
+ memset(&attr, 0, sizeof(attr));
+ attr.size = sizeof(attr);
+ attr.type = PERF_TYPE_SOFTWARE;
+ attr.config = PERF_COUNT_SW_DUMMY;
+ attr.sample_period = 1;
+ attr.disabled = 1;
+ attr.watermark = 1;
+ attr.context_switch = 1;
+ attr.wakeup_watermark = 1;
+
+ child = fork();
+ EXPECT_GE(child, 0);
+ if (child == 0)
+ do_child();
+ else if (child < 0) {
+ perror("fork()");
+ goto cleanup;
+ }
+
+ if (waitpid(child, &child_status, WSTOPPED) != child ||
+ !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) {
+ fprintf(stderr,
+ "failed to sycnhronize with child errno=%d status=%x\n",
+ errno,
+ child_status);
+ goto cleanup;
+ }
+
+ fd = syscall(__NR_perf_event_open, &attr, child, -1, -1,
+ PERF_FLAG_FD_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "failed opening event %llx\n", attr.config);
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETFL, FASYNC)) {
+ perror("F_SETFL FASYNC");
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETOWN, getpid())) {
+ perror("F_SETOWN getpid()");
+ goto cleanup;
+ }
+
+ if (fcntl(fd, F_SETSIG, SIGIO)) {
+ perror("F_SETSIG SIGIO");
+ goto cleanup;
+ }
+
+ p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (p == NULL) {
+ perror("mmap");
+ goto cleanup;
+ }
+
+ if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
+ perror("PERF_EVENT_IOC_ENABLE");
+ goto cleanup;
+ }
+
+ if (kill(child, SIGCONT) < 0) {
+ perror("SIGCONT");
+ goto cleanup;
+ }
+
+ if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR)
+ fprintf(stderr,
+ "expected SIGIO to terminate wait errno=%d status=%x\n%d",
+ errno,
+ child_status,
+ sigio_count);
+
+ EXPECT_GE(sigio_count, 1);
+
+cleanup:
+ if (p != NULL)
+ munmap(p, 2 * page_size);
+
+ if (fd >= 0)
+ close(fd);
+
+ if (child > 0) {
+ kill(child, SIGKILL);
+ waitpid(child, NULL, 0);
+ }
+
+ sigaction(SIGIO, &previous_sigio, NULL);
+}
+
+TEST_HARNESS_MAIN