summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alexei Starovoitov <ast@kernel.org> 2024-04-04 23:08:01 +0300
committer: Alexei Starovoitov <ast@kernel.org> 2024-04-04 23:08:01 +0300
commit: d82c045f9dfde6b9ea220d7f8310c98210dfc8cb (patch)
tree: 05376a14c790f914df2197a0431c3608ac025e66
parent: 21ab0b6d0cfcb8aa98e33baa83f933f963514027 (diff)
parent: 314a53623cd4e62e1b88126e5ed2bc87073d90ee (diff)
download: linux-d82c045f9dfde6b9ea220d7f8310c98210dfc8cb.tar.xz
Merge branch 'inline-bpf_get_branch_snapshot-bpf-helper'
Andrii Nakryiko says: ==================== Inline bpf_get_branch_snapshot() BPF helper Implement inlining of bpf_get_branch_snapshot() BPF helper using generic BPF assembly approach. This allows reducing LBR record usage right before LBR records are captured from inside a BPF program. See v1 cover letter ([0]) for some visual examples. I dropped them from v2 because there are multiple independent changes landing and being reviewed, all of which remove different parts of LBR record waste, so presenting the final state of LBR "waste" gets more complicated until all of the pieces land. [0] https://lore.kernel.org/bpf/20240321180501.734779-1-andrii@kernel.org/ v2->v3: - fix BPF_MUL instruction definition; v1->v2: - inlining of bpf_get_smp_processor_id() split out into a separate patch set implementing internal per-CPU BPF instruction; - add efficient divide-by-24 through multiplication logic, and leave comments to explain the idea behind it; this way inlined version of bpf_get_branch_snapshot() has no compromises compared to non-inlined version of the helper (Alexei). ==================== Link: https://lore.kernel.org/r/20240404002640.1774210-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r-- kernel/bpf/verifier.c 55
-rw-r--r-- kernel/trace/bpf_trace.c 4
2 files changed, 55 insertions, 4 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1e03ba9ed07b..ffaa9f7f153c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20188,6 +20188,61 @@ patch_map_ops_generic:
goto next_insn;
}
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
/* Implement bpf_kptr_xchg inline */
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_kptr_xchg &&
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6d0c95638e1b..afb232b1d7c2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1188,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
@@ -1203,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return -ENOENT;
return entry_cnt * br_entry_size;
-#endif
}
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {