diff options
Diffstat (limited to 'tools')
383 files changed, 21596 insertions, 3581 deletions
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h index 4d471d9511a5..6fffe5682713 100644 --- a/tools/arch/ia64/include/asm/barrier.h +++ b/tools/arch/ia64/include/asm/barrier.h @@ -39,9 +39,6 @@ * sequential memory pages only. */ -/* XXX From arch/ia64/include/uapi/asm/gcc_intrin.h */ -#define ia64_mf() asm volatile ("mf" ::: "memory") - #define mb() ia64_mf() #define rmb() mb() #define wmb() mb() diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 877827b7c2c3..a61051400311 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -6,7 +6,7 @@ * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ -#include "inat_types.h" +#include "inat_types.h" /* __ignore_sync_check__ */ /* * Internal bits. Don't use bitmasks directly, because these bits are diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index cc777c185212..dc632b41f135 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -9,7 +9,7 @@ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ -#include "inat.h" +#include "inat.h" /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? 
__BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) @@ -132,13 +132,25 @@ struct insn { #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); -extern void insn_get_prefixes(struct insn *insn); -extern void insn_get_opcode(struct insn *insn); -extern void insn_get_modrm(struct insn *insn); -extern void insn_get_sib(struct insn *insn); -extern void insn_get_displacement(struct insn *insn); -extern void insn_get_immediate(struct insn *insn); -extern void insn_get_length(struct insn *insn); +extern int insn_get_prefixes(struct insn *insn); +extern int insn_get_opcode(struct insn *insn); +extern int insn_get_modrm(struct insn *insn); +extern int insn_get_sib(struct insn *insn); +extern int insn_get_displacement(struct insn *insn); +extern int insn_get_immediate(struct insn *insn); +extern int insn_get_length(struct insn *insn); + +enum insn_mode { + INSN_MODE_32, + INSN_MODE_64, + /* Mode is determined by the current kernel build. 
*/ + INSN_MODE_KERN, + INSN_NUM_MODES, +}; + +extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); + +#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) @@ -149,17 +161,6 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); -/* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, - const void *kaddr, int buf_len) -{ -#ifdef CONFIG_X86_64 - insn_init(insn, kaddr, buf_len, 1); -#else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, buf_len, 0); -#endif -} - static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -179,13 +180,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn) return !!insn->emulate_prefix_size; } -/* Ensure this instruction is decoded completely */ -static inline int insn_complete(struct insn *insn) -{ - return insn->opcode.got && insn->modrm.got && insn->sib.got && - insn->displacement.got && insn->immediate.got; -} - static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 546d6ecf0a35..45029354e0a8 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -628,8 +628,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h new file mode 100644 index 000000000000..c1e5e818ba16 --- /dev/null +++ b/tools/arch/x86/include/asm/nops.h @@ -0,0 
+1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_NOPS_H +#define _ASM_X86_NOPS_H + +/* + * Define nops for use with alternative() and for tracing. + */ + +#ifndef CONFIG_64BIT + +/* + * Generic 32bit nops from GAS: + * + * 1: nop + * 2: movl %esi,%esi + * 3: leal 0x0(%esi),%esi + * 4: leal 0x0(%esi,%eiz,1),%esi + * 5: leal %ds:0x0(%esi,%eiz,1),%esi + * 6: leal 0x0(%esi),%esi + * 7: leal 0x0(%esi,%eiz,1),%esi + * 8: leal %ds:0x0(%esi,%eiz,1),%esi + * + * Except 5 and 8, which are DS prefixed 4 and 7 resp, where GAS would emit 2 + * nop instructions. + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x89,0xf6 +#define BYTES_NOP3 0x8d,0x76,0x00 +#define BYTES_NOP4 0x8d,0x74,0x26,0x00 +#define BYTES_NOP5 0x3e,BYTES_NOP4 +#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x3e,BYTES_NOP7 + +#else + +/* + * Generic 64bit nops from GAS: + * + * 1: nop + * 2: osp nop + * 3: nopl (%eax) + * 4: nopl 0x00(%eax) + * 5: nopl 0x00(%eax,%eax,1) + * 6: osp nopl 0x00(%eax,%eax,1) + * 7: nopl 0x00000000(%eax) + * 8: nopl 0x00000000(%eax,%eax,1) + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x66,BYTES_NOP1 +#define BYTES_NOP3 0x0f,0x1f,0x00 +#define BYTES_NOP4 0x0f,0x1f,0x40,0x00 +#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00 +#define BYTES_NOP6 0x66,BYTES_NOP5 +#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 + +#endif /* CONFIG_64BIT */ + +#ifdef __ASSEMBLY__ +#define _ASM_MK_NOP(x) .byte x +#else +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" +#endif + +#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) + +#define ASM_NOP_MAX 8 + +#ifndef 
__ASSEMBLY__ +extern const unsigned char * const x86_nops[]; +#endif + +#endif /* _ASM_X86_NOPS_H */ diff --git a/tools/arch/x86/kcpuid/Makefile b/tools/arch/x86/kcpuid/Makefile new file mode 100644 index 000000000000..87b554fab14b --- /dev/null +++ b/tools/arch/x86/kcpuid/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for x86/kcpuid tool + +kcpuid : kcpuid.c + +CFLAGS = -Wextra + +BINDIR ?= /usr/sbin + +HWDATADIR ?= /usr/share/misc/ + +override CFLAGS += -O2 -Wall -I../../../include + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +.PHONY : clean +clean : + @rm -f kcpuid + +install : kcpuid + install -d $(DESTDIR)$(BINDIR) + install -m 755 -p kcpuid $(DESTDIR)$(BINDIR)/kcpuid + install -m 444 -p cpuid.csv $(HWDATADIR)/cpuid.csv diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv new file mode 100644 index 000000000000..4f1c4b0c29e9 --- /dev/null +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -0,0 +1,400 @@ +# The basic row format is: +# LEAF, SUBLEAF, register_name, bits, short_name, long_description + +# Leaf 00H + 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs + +# Leaf 01H + 1, 0, EAX, 3:0, stepping, Stepping ID + 1, 0, EAX, 7:4, model, Model + 1, 0, EAX, 11:8, family, Family ID + 1, 0, EAX, 13:12, processor, Processor Type + 1, 0, EAX, 19:16, model_ext, Extended Model ID + 1, 0, EAX, 27:20, family_ext, Extended Family ID + + 1, 0, EBX, 7:0, brand, Brand Index + 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes + 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package + 1, 0, EBX, 31:24, apic_id, Initial APIC ID + + 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) + 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported + 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout + 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported + 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL + 1, 0, ECX, 
5, vmx, Virtual Machine Extensions supported + 1, 0, ECX, 6, smx, Safer Mode Extension supported + 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep Technology + 1, 0, ECX, 8, tm2, Thermal Monitor 2 + 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) + 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) + 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported + 1, 0, ECX, 12, fma, FMA extensions using YMM state supported + 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported + 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported + 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present + 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present + 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported + 1, 0, ECX, 19, sse4_1, SSE4.1 feature present + 1, 0, ECX, 20, sse4_2, SSE4.2 feature present + 1, 0, ECX, 21, x2apic, x2APIC supported + 1, 0, ECX, 22, movbe, MOVBE instruction supported + 1, 0, ECX, 23, popcnt, POPCNT instruction supported + 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value + 1, 0, ECX, 25, aesni, AESNI instruction supported + 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) + 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 + 1, 0, ECX, 28, avx, AVX instruction supported + 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported + 1, 0, ECX, 30, rdrand, RDRAND instruction supported + + 1, 0, EDX, 0, fpu, x87 FPU on chip + 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement + 1, 0, EDX, 2, de, Debugging Extensions + 1, 0, EDX, 3, pse, Page Size Extensions + 1, 0, EDX, 4, tsc, Time Stamp Counter + 1, 0, EDX, 5, msr, RDMSR and WRMSR Support + 1, 0, EDX, 6, pae, Physical Address Extensions + 1, 0, EDX, 7, mce, Machine Check Exception + 1, 0, EDX, 8, cx8, 
CMPXCHG8B instr + 1, 0, EDX, 9, apic, APIC on Chip + 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs + 1, 0, EDX, 12, mtrr, Memory Type Range Registers + 1, 0, EDX, 13, pge, Page Global Bit + 1, 0, EDX, 14, mca, Machine Check Architecture + 1, 0, EDX, 15, cmov, Conditional Move Instrs + 1, 0, EDX, 16, pat, Page Attribute Table + 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension + 1, 0, EDX, 18, psn, Processor Serial Number + 1, 0, EDX, 19, clflush, CLFLUSH instr +# 1, 0, EDX, 20, + 1, 0, EDX, 21, ds, Debug Store + 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities + 1, 0, EDX, 23, mmx, Intel MMX Technology + 1, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR Instrs + 1, 0, EDX, 25, sse, SSE + 1, 0, EDX, 26, sse2, SSE2 + 1, 0, EDX, 27, ss, Self Snoop + 1, 0, EDX, 28, hit, Max APIC IDs + 1, 0, EDX, 29, tm, Thermal Monitor +# 1, 0, EDX, 30, + 1, 0, EDX, 31, pbe, Pending Break Enable + +# Leaf 02H +# cache and TLB descriptor info + +# Leaf 03H +# Processor Serial Number, introduced on Pentium III, not valid for +# latest models + +# Leaf 04H +# thread/core and cache topology + 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified + 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) + 4, 0, EAX, 8, cache_self_init, Cache Self Initialization + 4, 0, EAX, 9, fully_associate, Fully Associative cache +# 4, 0, EAX, 13:10, resvd, resvd + 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache + 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package + + 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes + 4, 0, EBX, 21:12, cache_partition, Physical Line partitions + 4, 0, EBX, 31:22, cache_ways, Ways of associativity + 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 + 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not guaranteed to act upon lower level caches of non-originating threads sharing this cache + 4, 0, EDX, 1, c_incl, Whether cache is
inclusive of lower cache level + 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing + +# Leaf 05H +# MONITOR/MWAIT + 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes + 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes + 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported + 5, 0, ECX, 1, mwait_irq_break, Interrupts are treated as break-events for MWAIT even when interrupts are disabled + 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT + 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT + 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT + 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT + 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT + 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT + 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT + 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT + +# Leaf 06H +# Thermal & Power Management + + 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported + 6, 0, EAX, 1, turbo, Intel Turbo Boost + 6, 0, EAX, 2, arat, Always running APIC timer +# 6, 0, EAX, 3, resv, Reserved + 6, 0, EAX, 4, pln, Power limit notifications supported + 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported + 6, 0, EAX, 6, ptm, Package thermal management supported + 6, 0, EAX, 7, hwp, HWP base register + 6, 0, EAX, 8, hwp_notify, HWP notification + 6, 0, EAX, 9, hwp_act_window, HWP activity window + 6, 0, EAX, 10, hwp_energy, HWP energy performance preference + 6, 0, EAX, 11, hwp_pkg_req, HWP package level request +# 6, 0, EAX, 12, resv, Reserved + 6, 0, EAX, 13, hdc, HDC base registers supported + 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 + 6, 0, EAX, 15, hwp_cap, Highest Performance change supported + 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported + 6, 0, EAX, 17, hwp_flex,
Flexible HWP is supported + 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported +# 6, 0, EAX, 19, resv, Reserved + 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported + + 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor + 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF + 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported + +# Leaf 07H +# ECX == 0 +# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 +# XXX: Do we really need to enumerate each and every AVX512 sub features + + 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported + 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported + 7, 0, EBX, 2, sgx, Software Guard Extensions + 7, 0, EBX, 3, bmi1, BMI1 + 7, 0, EBX, 4, hle, Hardware Lock Elision + 7, 0, EBX, 5, avx2, AVX2 +# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions + 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention + 7, 0, EBX, 8, bmi2, BMI2 + 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB + 7, 0, EBX, 10, invpcid, INVPCID instruction + 7, 0, EBX, 11, rtm, Restricted Transactional Memory + 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring capability + 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS + 7, 0, EBX, 14, mpx, Memory Protection Extensions + 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability + 7, 0, EBX, 16, avx512f, AVX512 Foundation instr + 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr + 7, 0, EBX, 18, rdseed, RDSEED instr + 7, 0, EBX, 19, adx, ADX instr + 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention + 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add +# 7, 0, EBX, 22, resvd, resvd + 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr + 7, 0, EBX, 24, clwb, CLWB instr + 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr + 7, 0, EBX, 26, avx512pf, Prefetch + 7, 0, EBX, 27, avx512er, AVX512 Exponent 
Reciprocal instr + 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr + 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr + 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr + 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extensions (VL) + 7, 0, ECX, 0, prefetchwt1, PREFETCHWT1 instr + 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions + 7, 0, ECX, 2, umip, User-mode Instruction Prevention + + 7, 0, ECX, 3, pku, Protection Keys for User-mode pages + 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys +# 7, 0, ECX, 16:5, resvd, resvd + 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode + 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX +# 7, 0, ECX, 29:23, resvd, resvd + 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration +# 7, 0, ECX, 31, resvd, resvd + +# Leaf 08H +# + + +# Leaf 09H +# Direct Cache Access (DCA) information + 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP + +# Leaf 0AH +# Architectural Performance Monitoring +# +# Do we really need to print out the PMU related stuff? +# Does normal user really care about it? 
+# + 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version + 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Number of general-purpose PMU counters per logical CPU + 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit width of PMU counter + 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events + + 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available + 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available + 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available + 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available + 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available + 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available + 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available + + 0xA, 0, EDX, 4:0, pmu_fixed_cnt_num, Number of fixed-function PMU counters + 0xA, 0, EDX, 12:5, pmu_fixed_cnt_bits, Bit width of fixed-function PMU counters + +# Leaf 0BH +# Extended Topology Enumeration Leaf +# + + 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type + 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type + 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core + 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID of the current logical processor + + +# Leaf 0DH +# Processor Extended State + + 0xD, 0, EAX, 0, x87, X87 state + 0xD, 0, EAX, 1, sse, SSE state + 0xD, 0, EAX, 2, avx, AVX state + 0xD, 0, EAX, 4:3, mpx, MPX state + 0xD, 0, EAX, 7:5, avx512, AVX-512 state + 0xD, 0, EAX, 9, pkru, PKRU state + + 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 + 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area + + 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available + 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported + 0xD, 1, EAX, 2, xgetbv,
XGETBV supported + 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported + + 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 + 0xD, 1, ECX, 8, pt, PT state + 0xD, 1, ECX, 11, cet_usr, CET user state + 0xD, 1, ECX, 12, cet_supv, CET supervisor state + 0xD, 1, ECX, 13, hdc, HDC state + 0xD, 1, ECX, 16, hwp, HWP state + +# Leaf 0FH +# Intel RDT Monitoring + + 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types + 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported + + 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this type + 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported + 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported + 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported + +# Leaf 10H +# Intel RDT Allocation + + 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported + 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported + 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported + + +# Leaf 12H +# SGX Capability +# +# Some detailed SGX features not added yet + + 0x12, 0, EAX, 0, sgx1, SGX1 leaf functions supported + 0x12, 0, EAX, 1, sgx2, SGX2 leaf functions supported + + +# Leaf 14H +# Intel Processor Trace +# + +# Leaf 15H +# Time Stamp Counter and Nominal Core Crystal Clock Information + + 0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/"core crystal clock" ratio + 0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/"core crystal clock" ratio + 0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz + +# Leaf 16H +# Processor Frequency Information + + 0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz + 0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz + 0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz + +# Leaf 17H +# System-On-Chip Vendor
Attribute + + 0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf + 0x17, 0, EBX, 15:0, soc_vid, SOC Vendor ID + 0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme + 0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by vendor + 0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID + +# Leaf 18H +# Deterministic Address Translation Parameters + + +# Leaf 19H +# Key Locker Leaf + + +# Leaf 1AH +# Hybrid Information + + 0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core + + +# Leaf 1FH +# V2 Extended Topology - A preferred superset to leaf 0BH + + +# According to SDM +# 40000000H - 4FFFFFFFH is invalid range + + +# Leaf 80000001H +# Extended Processor Signature and Feature Bits + +0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode +0x80000001, 0, ECX, 5, lzcnt, LZCNT +0x80000001, 0, ECX, 8, prefetchw, PREFETCHW + +0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported +0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available +0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported +0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available +#0x80000001, 0, EDX, 29, 64b, 64b Architecture supported + +# Leaf 80000002H/80000003H/80000004H +# Processor Brand String + +# Leaf 80000005H +# Reserved + +# Leaf 80000006H +# Extended L2 Cache Features + +0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes +0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity +0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units + + +# Leaf 80000007H + +0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available + + +# Leaf 80000008H + +0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits +0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits +0x80000008, 0, EBX, 9, wbnoinvd, WBNOINVD + +# 0x8000001E +# EAX: Extended APIC ID +0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID +# EBX: Core Identifiers +0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID 
+0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1 +# ECX: Node Identifiers +0x8000001E, 0, ECX, 7:0, node_id, Node ID +0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved } + +# 8000001F: AMD Secure Encryption +0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption +0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization +0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR +0x8000001F, 0, EAX, 3, seves, SEV Encrypted State +0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption +0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled +0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest +0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c new file mode 100644 index 000000000000..dae75511fef7 --- /dev/null +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <getopt.h> + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +typedef unsigned int u32; +typedef unsigned long long u64; + +char *def_csv = "/usr/share/misc/cpuid.csv"; +char *user_csv; + + +/* Cover both single-bit flag and multiple-bits fields */ +struct bits_desc { + /* start and end bits */ + int start, end; + /* 0 or 1 for 1-bit flag */ + int value; + char simp[32]; + char detail[256]; +}; + +/* descriptor info for eax/ebx/ecx/edx */ +struct reg_desc { + /* number of valid entries */ + int nr; + struct bits_desc descs[32]; +}; + +enum { + R_EAX = 0, + R_EBX, + R_ECX, + R_EDX, + NR_REGS +}; + +struct subleaf { + u32 index; + u32 sub; + u32 eax, ebx, ecx, edx; + struct 
reg_desc info[NR_REGS]; +}; + +/* Represent one leaf (basic or extended) */ +struct cpuid_func { + /* + * Array of subleafs for this func, if there is no subleafs + * then the leafs[0] is the main leaf + */ + struct subleaf *leafs; + int nr; +}; + +struct cpuid_range { + /* array of main leafs */ + struct cpuid_func *funcs; + /* number of valid leafs */ + int nr; + bool is_ext; +}; + +/* + * basic: basic functions range: [0... ] + * ext: extended functions range: [0x80000000... ] + */ +struct cpuid_range *leafs_basic, *leafs_ext; + +static int num_leafs; +static bool is_amd; +static bool show_details; +static bool show_raw; +static bool show_flags_only = true; +static u32 user_index = 0xFFFFFFFF; +static u32 user_sub = 0xFFFFFFFF; +static int flines; + +static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline bool has_subleafs(u32 f) +{ + if (f == 0x7 || f == 0xd) + return true; + + if (is_amd) { + if (f == 0x8000001d) + return true; + return false; + } + + switch (f) { + case 0x4: + case 0xb: + case 0xf: + case 0x10: + case 0x14: + case 0x18: + case 0x1f: + return true; + default: + return false; + } +} + +static void leaf_print_raw(struct subleaf *leaf) +{ + if (has_subleafs(leaf->index)) { + if (leaf->sub == 0) + printf("0x%08x: subleafs:\n", leaf->index); + + printf(" %2d: EAX=0x%08x, EBX=0x%08x, ECX=0x%08x, EDX=0x%08x\n", + leaf->sub, leaf->eax, leaf->ebx, leaf->ecx, leaf->edx); + } else { + printf("0x%08x: EAX=0x%08x, EBX=0x%08x, ECX=0x%08x, EDX=0x%08x\n", + leaf->index, leaf->eax, leaf->ebx, leaf->ecx, leaf->edx); + } +} + +/* Return true is the input eax/ebx/ecx/edx are all zero */ +static bool cpuid_store(struct cpuid_range *range, u32 f, int subleaf, + u32 a, u32 b, u32 c, u32 d) +{ + struct cpuid_func *func; + struct subleaf *leaf; + int s = 0; + + if (a == 0 && 
b == 0 && c == 0 && d == 0) + return true; + + /* + * Cut off vendor-prefix from CPUID function as we're using it as an + * index into ->funcs. + */ + func = &range->funcs[f & 0xffff]; + + if (!func->leafs) { + func->leafs = malloc(sizeof(struct subleaf)); + if (!func->leafs) + perror("malloc func leaf"); + + func->nr = 1; + } else { + s = func->nr; + func->leafs = realloc(func->leafs, (s + 1) * sizeof(*leaf)); + if (!func->leafs) + perror("realloc f->leafs"); + + func->nr++; + } + + leaf = &func->leafs[s]; + + leaf->index = f; + leaf->sub = subleaf; + leaf->eax = a; + leaf->ebx = b; + leaf->ecx = c; + leaf->edx = d; + + return false; +} + +static void raw_dump_range(struct cpuid_range *range) +{ + u32 f; + int i; + + printf("%s Leafs :\n", range->is_ext ? "Extended" : "Basic"); + printf("================\n"); + + for (f = 0; (int)f < range->nr; f++) { + struct cpuid_func *func = &range->funcs[f]; + u32 index = f; + + if (range->is_ext) + index += 0x80000000; + + /* Skip leaf without valid items */ + if (!func->nr) + continue; + + /* First item is the main leaf, followed by all subleafs */ + for (i = 0; i < func->nr; i++) + leaf_print_raw(&func->leafs[i]); + } +} + +#define MAX_SUBLEAF_NUM 32 +struct cpuid_range *setup_cpuid_range(u32 input_eax) +{ + u32 max_func, idx_func; + int subleaf; + struct cpuid_range *range; + u32 eax, ebx, ecx, edx; + u32 f = input_eax; + int max_subleaf; + bool allzero; + + eax = input_eax; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + max_func = eax; + idx_func = (max_func & 0xffff) + 1; + + range = malloc(sizeof(struct cpuid_range)); + if (!range) + perror("malloc range"); + + if (input_eax & 0x80000000) + range->is_ext = true; + else + range->is_ext = false; + + range->funcs = malloc(sizeof(struct cpuid_func) * idx_func); + if (!range->funcs) + perror("malloc range->funcs"); + + range->nr = idx_func; + memset(range->funcs, 0, sizeof(struct cpuid_func) * idx_func); + + for (; f <= max_func; f++) { + eax = f; + subleaf = 
ecx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); + if (allzero) + continue; + num_leafs++; + + if (!has_subleafs(f)) + continue; + + max_subleaf = MAX_SUBLEAF_NUM; + + /* + * Some can provide the exact number of subleafs, + * others have to be tried (0xf) + */ + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) + max_subleaf = (eax & 0xff) + 1; + + if (f == 0xb) + max_subleaf = 2; + + for (subleaf = 1; subleaf < max_subleaf; subleaf++) { + eax = f; + ecx = subleaf; + + cpuid(&eax, &ebx, &ecx, &edx); + allzero = cpuid_store(range, f, subleaf, + eax, ebx, ecx, edx); + if (allzero) + continue; + num_leafs++; + } + + } + + return range; +} + +/* + * The basic row format for cpuid.csv is + * LEAF,SUBLEAF,register_name,bits,short name,long description + * + * like: + * 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs + * 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) + */ +static int parse_line(char *line) +{ + char *str; + int i; + struct cpuid_range *range; + struct cpuid_func *func; + struct subleaf *leaf; + u32 index; + u32 sub; + char buffer[512]; + char *buf; + /* + * Tokens: + * 1. leaf + * 2. subleaf + * 3. register + * 4. bits + * 5. short name + * 6. 
long detail + */ + char *tokens[6]; + struct reg_desc *reg; + struct bits_desc *bdesc; + int reg_index; + char *start, *end; + + /* Skip comments and NULL line */ + if (line[0] == '#' || line[0] == '\n') + return 0; + + strncpy(buffer, line, 511); + buffer[511] = 0; + str = buffer; + for (i = 0; i < 5; i++) { + tokens[i] = strtok(str, ","); + if (!tokens[i]) + goto err_exit; + str = NULL; + } + tokens[5] = strtok(str, "\n"); + if (!tokens[5]) + goto err_exit; + + /* index/main-leaf */ + index = strtoull(tokens[0], NULL, 0); + + if (index & 0x80000000) + range = leafs_ext; + else + range = leafs_basic; + + index &= 0x7FFFFFFF; + /* Skip line parsing for non-existing indexes */ + if ((int)index >= range->nr) + return -1; + + func = &range->funcs[index]; + + /* Return if the index has no valid item on this platform */ + if (!func->nr) + return 0; + + /* subleaf */ + sub = strtoul(tokens[1], NULL, 0); + if ((int)sub > func->nr) + return -1; + + leaf = &func->leafs[sub]; + buf = tokens[2]; + + if (strcasestr(buf, "EAX")) + reg_index = R_EAX; + else if (strcasestr(buf, "EBX")) + reg_index = R_EBX; + else if (strcasestr(buf, "ECX")) + reg_index = R_ECX; + else if (strcasestr(buf, "EDX")) + reg_index = R_EDX; + else + goto err_exit; + + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + /* bit flag or bits field */ + buf = tokens[3]; + + end = strtok(buf, ":"); + bdesc->end = strtoul(end, NULL, 0); + bdesc->start = bdesc->end; + + /* start != NULL means it is bit fields */ + start = strtok(NULL, ":"); + if (start) + bdesc->start = strtoul(start, NULL, 0); + + strcpy(bdesc->simp, tokens[4]); + strcpy(bdesc->detail, tokens[5]); + return 0; + +err_exit: + printf("Warning: wrong line format:\n"); + printf("\tline[%d]: %s\n", flines, line); + return -1; +} + +/* Parse csv file, and construct the array of all leafs and subleafs */ +static void parse_text(void) +{ + FILE *file; + char *filename, *line = NULL; + size_t len = 0; + int ret; + + if (show_raw) + return; 
+ + filename = user_csv ? user_csv : def_csv; + file = fopen(filename, "r"); + if (!file) { + /* Fallback to a csv in the same dir */ + file = fopen("./cpuid.csv", "r"); + } + + if (!file) { + printf("Fail to open '%s'\n", filename); + return; + } + + while (1) { + ret = getline(&line, &len, file); + flines++; + if (ret > 0) + parse_line(line); + + if (feof(file)) + break; + } + + fclose(file); +} + + +/* Decode every eax/ebx/ecx/edx */ +static void decode_bits(u32 value, struct reg_desc *rdesc) +{ + struct bits_desc *bdesc; + int start, end, i; + u32 mask; + + for (i = 0; i < rdesc->nr; i++) { + bdesc = &rdesc->descs[i]; + + start = bdesc->start; + end = bdesc->end; + if (start == end) { + /* single bit flag */ + if (value & (1 << start)) + printf("\t%-20s %s%s\n", + bdesc->simp, + show_details ? "-" : "", + show_details ? bdesc->detail : "" + ); + } else { + /* bit fields */ + if (show_flags_only) + continue; + + mask = ((u64)1 << (end - start + 1)) - 1; + printf("\t%-20s\t: 0x%-8x\t%s%s\n", + bdesc->simp, + (value >> start) & mask, + show_details ? "-" : "", + show_details ? bdesc->detail : "" + ); + } + } +} + +static void show_leaf(struct subleaf *leaf) +{ + if (!leaf) + return; + + if (show_raw) + leaf_print_raw(leaf); + + decode_bits(leaf->eax, &leaf->info[R_EAX]); + decode_bits(leaf->ebx, &leaf->info[R_EBX]); + decode_bits(leaf->ecx, &leaf->info[R_ECX]); + decode_bits(leaf->edx, &leaf->info[R_EDX]); +} + +static void show_func(struct cpuid_func *func) +{ + int i; + + if (!func) + return; + + for (i = 0; i < func->nr; i++) + show_leaf(&func->leafs[i]); +} + +static void show_range(struct cpuid_range *range) +{ + int i; + + for (i = 0; i < range->nr; i++) + show_func(&range->funcs[i]); +} + +static inline struct cpuid_func *index_to_func(u32 index) +{ + struct cpuid_range *range; + + range = (index & 0x80000000) ? 
leafs_ext : leafs_basic; + index &= 0x7FFFFFFF; + + if (((index & 0xFFFF) + 1) > (u32)range->nr) { + printf("ERR: invalid input index (0x%x)\n", index); + return NULL; + } + return &range->funcs[index]; +} + +static void show_info(void) +{ + struct cpuid_func *func; + + if (show_raw) { + /* Show all of the raw output of 'cpuid' instr */ + raw_dump_range(leafs_basic); + raw_dump_range(leafs_ext); + return; + } + + if (user_index != 0xFFFFFFFF) { + /* Only show specific leaf/subleaf info */ + func = index_to_func(user_index); + if (!func) + return; + + /* Dump the raw data also */ + show_raw = true; + + if (user_sub != 0xFFFFFFFF) { + if (user_sub + 1 <= (u32)func->nr) { + show_leaf(&func->leafs[user_sub]); + return; + } + + printf("ERR: invalid input subleaf (0x%x)\n", user_sub); + } + + show_func(func); + return; + } + + printf("CPU features:\n=============\n\n"); + show_range(leafs_basic); + show_range(leafs_ext); +} + +static void setup_platform_cpuid(void) +{ + u32 eax, ebx, ecx, edx; + + /* Check vendor */ + eax = ebx = ecx = edx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + /* "htuA" */ + if (ebx == 0x68747541) + is_amd = true; + + /* Setup leafs for the basic and extended range */ + leafs_basic = setup_cpuid_range(0x0); + leafs_ext = setup_cpuid_range(0x80000000); +} + +static void usage(void) +{ + printf("kcpuid [-abdfhr] [-l leaf] [-s subleaf]\n" + "\t-a|--all Show both bit flags and complex bit fields info\n" + "\t-b|--bitflags Show boolean flags only\n" + "\t-d|--detail Show details of the flag/fields (default)\n" + "\t-f|--flags Specify the cpuid csv file\n" + "\t-h|--help Show usage info\n" + "\t-l|--leaf=index Specify the leaf you want to check\n" + "\t-r|--raw Show raw cpuid data\n" + "\t-s|--subleaf=sub Specify the subleaf you want to check\n" + ); +} + +static struct option opts[] = { + { "all", no_argument, NULL, 'a' }, /* show both bit flags and fields */ + { "bitflags", no_argument, NULL, 'b' }, /* only show bit flags, default on */ + { "detail", 
no_argument, NULL, 'd' }, /* show detail descriptions */ + { "file", required_argument, NULL, 'f' }, /* use user's cpuid file */ + { "help", no_argument, NULL, 'h'}, /* show usage */ + { "leaf", required_argument, NULL, 'l'}, /* only check a specific leaf */ + { "raw", no_argument, NULL, 'r'}, /* show raw CPUID leaf data */ + { "subleaf", required_argument, NULL, 's'}, /* check a specific subleaf */ + { NULL, 0, NULL, 0 } +}; + +static int parse_options(int argc, char *argv[]) +{ + int c; + + while ((c = getopt_long(argc, argv, "abdf:hl:rs:", + opts, NULL)) != -1) + switch (c) { + case 'a': + show_flags_only = false; + break; + case 'b': + show_flags_only = true; + break; + case 'd': + show_details = true; + break; + case 'f': + user_csv = optarg; + break; + case 'h': + usage(); + exit(1); + break; + case 'l': + /* main leaf */ + user_index = strtoul(optarg, NULL, 0); + break; + case 'r': + show_raw = true; + break; + case 's': + /* subleaf */ + user_sub = strtoul(optarg, NULL, 0); + break; + default: + printf("%s: Invalid option '%c'\n", argv[0], optopt); + return -1; + } + + return 0; +} + +/* + * Do 4 things in turn: + * 1. Parse user options + * 2. Parse and store all the CPUID leaf data supported on this platform + * 3. Parse the csv file, while skipping leafs which are not available + * on this platform + * 4.
Print leafs info based on user options + */ +int main(int argc, char *argv[]) +{ + if (parse_options(argc, argv)) + return -1; + + /* Setup the cpuid leafs of current platform */ + setup_platform_cpuid(); + + /* Read and parse the 'cpuid.csv' */ + parse_text(); + + show_info(); + return 0; +} diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index 4f5ed49e1b4e..dfbcc6405941 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -4,7 +4,7 @@ * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ -#include "../include/asm/insn.h" +#include "../include/asm/insn.h" /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index 3d9355ed1246..c41f95815480 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -11,10 +11,13 @@ #else #include <string.h> #endif -#include "../include/asm/inat.h" -#include "../include/asm/insn.h" +#include "../include/asm/inat.h" /* __ignore_sync_check__ */ +#include "../include/asm/insn.h" /* __ignore_sync_check__ */ -#include "../include/asm/emulate_prefix.h" +#include <linux/errno.h> +#include <linux/kconfig.h> + +#include "../include/asm/emulate_prefix.h" /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ @@ -51,6 +54,7 @@ * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) @@ -111,8 +115,12 @@ static void insn_get_emulate_prefix(struct insn *insn) * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. 
+ * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_prefixes(struct insn *insn) +int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; @@ -120,7 +128,7 @@ void insn_get_prefixes(struct insn *insn) int i, nb; if (prefixes->got) - return; + return 0; insn_get_emulate_prefix(insn); @@ -230,8 +238,10 @@ vex_end: prefixes->got = 1; + return 0; + err_out: - return; + return -ENODATA; } /** @@ -243,16 +253,25 @@ err_out: * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_opcode(struct insn *insn) +int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; + int pfx_id, ret; insn_byte_t op; - int pfx_id; + if (opcode->got) - return; - if (!insn->prefixes.got) - insn_get_prefixes(insn); + return 0; + + if (!insn->prefixes.got) { + ret = insn_get_prefixes(insn); + if (ret) + return ret; + } /* Get first opcode */ op = get_next(insn_byte_t, insn); @@ -267,9 +286,13 @@ void insn_get_opcode(struct insn *insn) insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && - !inat_is_group(insn->attr))) - insn->attr = 0; /* This instruction is bad */ - goto end; /* VEX has only 1 byte for opcode */ + !inat_is_group(insn->attr))) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } + /* VEX has only 1 byte for opcode */ + goto end; } insn->attr = inat_get_opcode_attribute(op); @@ -280,13 +303,18 @@ void insn_get_opcode(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } - if (inat_must_vex(insn->attr)) - insn->attr = 0; /* This instruction is bad */ + + if (inat_must_vex(insn->attr)) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + }
end: opcode->got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -296,15 +324,25 @@ err_out: * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_modrm(struct insn *insn) +int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; + int ret; + if (modrm->got) - return; - if (!insn->opcode.got) - insn_get_opcode(insn); + return 0; + + if (!insn->opcode.got) { + ret = insn_get_opcode(insn); + if (ret) + return ret; + } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); @@ -313,17 +351,22 @@ void insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) - insn->attr = 0; /* This is bad */ + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + /* Bad insn */ + insn->attr = 0; + return -EINVAL; + } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; + modrm->got = 1; + return 0; err_out: - return; + return -ENODATA; } @@ -337,11 +380,16 @@ err_out: int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; + int ret; if (!insn->x86_64) return 0; - if (!modrm->got) - insn_get_modrm(insn); + + if (!modrm->got) { + ret = insn_get_modrm(insn); + if (ret) + return 0; + } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. @@ -355,15 +403,25 @@ int insn_rip_relative(struct insn *insn) * * If necessary, first collects the instruction up to and including the * ModRM byte. + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. 
*/ -void insn_get_sib(struct insn *insn) +int insn_get_sib(struct insn *insn) { insn_byte_t modrm; + int ret; if (insn->sib.got) - return; - if (!insn->modrm.got) - insn_get_modrm(insn); + return 0; + + if (!insn->modrm.got) { + ret = insn_get_modrm(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && @@ -374,8 +432,10 @@ void insn_get_sib(struct insn *insn) } insn->sib.got = 1; + return 0; + err_out: - return; + return -ENODATA; } @@ -386,15 +446,25 @@ err_out: * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_displacement(struct insn *insn) +int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; + int ret; if (insn->displacement.got) - return; - if (!insn->sib.got) - insn_get_sib(insn); + return 0; + + if (!insn->sib.got) { + ret = insn_get_sib(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: @@ -436,9 +506,10 @@ void insn_get_displacement(struct insn *insn) } out: insn->displacement.got = 1; + return 0; err_out: - return; + return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ @@ -537,20 +608,30 @@ err_out: } /** - * insn_get_immediate() - Get the immediates of instruction + * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded.
Unsigned-value can be - * get by bit masking with ((1 << (nbytes * 8)) - 1) + * computed by bit masking with ((1 << (nbytes * 8)) - 1) + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_immediate(struct insn *insn) +int insn_get_immediate(struct insn *insn) { + int ret; + if (insn->immediate.got) - return; - if (!insn->displacement.got) - insn_get_displacement(insn); + return 0; + + if (!insn->displacement.got) { + ret = insn_get_displacement(insn); + if (ret) + return ret; + } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) @@ -597,9 +678,10 @@ void insn_get_immediate(struct insn *insn) } done: insn->immediate.got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -608,13 +690,65 @@ err_out: * * If necessary, first collects the instruction up to and including the * immediates bytes. - */ -void insn_get_length(struct insn *insn) + * + * Returns: + * - 0 on success + * - < 0 on error +*/ +int insn_get_length(struct insn *insn) { + int ret; + if (insn->length) - return; - if (!insn->immediate.got) - insn_get_immediate(insn); + return 0; + + if (!insn->immediate.got) { + ret = insn_get_immediate(insn); + if (ret) + return ret; + } + insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); + + return 0; +} + +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + +/** + * insn_decode() - Decode an x86 instruction + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr + * @m: insn mode, see enum insn_mode + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. 
+ */ +int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) +{ + int ret; + +#define INSN_MODE_KERN (enum insn_mode)-1 /* __ignore_sync_check__ mode is only valid in the kernel */ + + if (m == INSN_MODE_KERN) + insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); + else + insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); + + ret = insn_get_length(insn); + if (ret) + return ret; + + if (insn_complete(insn)) + return 0; + + return -EINVAL; } diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers deleted file mode 100644 index 854d084026dd..000000000000 --- a/tools/bpf/Makefile.helpers +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -ifndef allow-override - include ../scripts/Makefile.include - include ../scripts/utilities.mak -else - # Assume Makefile.helpers is being run from bpftool/Documentation - # subdirectory. Go up two more directories to fetch bpf.h header and - # associated script. - UP2DIR := ../../ -endif - -INSTALL ?= install -RM ?= rm -f -RMDIR ?= rmdir --ignore-fail-on-non-empty - -ifeq ($(V),1) - Q = -else - Q = @ -endif - -prefix ?= /usr/local -mandir ?= $(prefix)/man -man7dir = $(mandir)/man7 - -HELPERS_RST = bpf-helpers.rst -MAN7_RST = $(HELPERS_RST) - -_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) -DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) - -helpers: man7 -man7: $(DOC_MAN7) - -RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) - -$(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ - -$(OUTPUT)%.7: $(OUTPUT)%.rst -ifndef RST2MAN_DEP - $(error "rst2man not found, but required to generate man pages") -endif - $(QUIET_GEN)rst2man $< > $@ - -helpers-clean: - $(call QUIET_CLEAN, eBPF_helpers-manpage) - $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) - -helpers-install: helpers - $(call QUIET_INSTALL, eBPF_helpers-manpage) - $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - 
$(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) - -helpers-uninstall: - $(call QUIET_UNINST, eBPF_helpers-manpage) - $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) - $(Q)$(RMDIR) $(DESTDIR)$(man7dir) - -.PHONY: helpers helpers-clean helpers-install helpers-uninstall diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index a07dfc479270..00e560a17baf 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -1198,7 +1198,7 @@ static int cmd_run(char *num) else return CMD_OK; bpf_reset(); - } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts))); + } while (pcap_next_pkt() && (!has_limit || (++i < pkts))); rl_printf("bpf passes:%u fails:%u\n", pass, fail); diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d48e896be50..dfb7254a24e8 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -185,13 +185,13 @@ ldx | OP_LDXB number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } | OP_LDX number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } ; @@ -472,7 +472,7 @@ static void bpf_assert_max(void) { if (curr_instr >= BPF_MAXINSNS) { fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS); - exit(0); + exit(1); } } @@ -522,7 +522,7 @@ static int bpf_find_insns_offset(const char *label) if (ret == -ENOENT) { fprintf(stderr, "no such label \'%s\'!\n", label); - exit(0); + exit(1); } return ret; @@ -549,9 +549,11 @@ static uint8_t bpf_encode_jt_jf_offset(int off, int i) { int delta = off - i - 1; - if (delta < 0 || delta > 255) - fprintf(stderr, "warning: insn #%d jumps to insn #%d, " + if (delta < 0 || delta > 255) { + fprintf(stderr, "error: insn #%d jumps to insn #%d, " "which is out of range\n", 
i, off); + exit(1); + } return (uint8_t) delta; } diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 944cb4b7c95d..05ce4446b780 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -3,7 +3,6 @@ /bootstrap/ /bpftool bpftool*.8 -bpf-helpers.* FEATURE-DUMP.bpftool feature libbpf diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index f33cb02de95c..c49487905ceb 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -16,15 +16,12 @@ prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 -# Load targets for building eBPF helpers man page. -include ../../Makefile.helpers - MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) -man: man8 helpers +man: man8 man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -46,16 +43,16 @@ ifndef RST2MAN_DEP endif $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ -clean: helpers-clean +clean: $(call QUIET_CLEAN, Documentation) $(Q)$(RM) $(DOC_MAN8) -install: man helpers-install +install: man $(call QUIET_INSTALL, Documentation-man) $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) -uninstall: helpers-uninstall +uninstall: $(call QUIET_UNINST, Documentation-man) $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) $(Q)$(RMDIR) $(DESTDIR)$(man8dir) diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 84cf0639696f..7cd6681137f3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,16 +14,37 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } - *COMMAND* := { **skeleton** | **help** } + *COMMAND* := { **object** | **skeleton** | 
**help** } GEN COMMANDS ============= -| **bpftool** **gen skeleton** *FILE* +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] | **bpftool** **gen help** DESCRIPTION =========== + **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s + into a single resulting *OUTPUT_FILE*. All the files involved + are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for + user-space object files, but in addition to combining data + and instruction sections, .BTF and .BTF.ext (if present in + any of the input files) data are combined together. .BTF + data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting + BTF information. + + BPF static linking allows to partition BPF source code into + individually compiled files that are then linked into + a single resulting BPF object file, which can be used to + generate a BPF skeleton (with **gen skeleton** command) or + passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + **bpftool gen skeleton** *FILE* Generate BPF skeleton C header file for a given *FILE*. @@ -75,10 +96,13 @@ DESCRIPTION specific maps, programs, etc. As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name, derived from - object file name. I.e., if BPF object file name is - **example.o**, BPF object name will be **example**. The - following custom functions are provided in such case: + Each of them is prefixed with object name. Object name can + either be derived from object file name, i.e., if BPF object + file name is **example.o**, BPF object name will be + **example**. Object name can be also specified explicitly + through **name** *OBJECT_NAME* parameter.
The following + custom functions are provided (assuming **example** as + the object name): - **example__open** and **example__open_opts**. These functions are used to instantiate skeleton. It @@ -130,26 +154,19 @@ OPTIONS EXAMPLES ======== -**$ cat example.c** +**$ cat example1.bpf.c** :: #include <stdbool.h> #include <linux/ptrace.h> #include <linux/bpf.h> - #include "bpf_helpers.h" + #include <bpf/bpf_helpers.h> const volatile int param1 = 42; bool global_flag = true; struct { int x; } data = {}; - struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 128); - __type(key, int); - __type(value, long); - } my_map SEC(".maps"); - SEC("raw_tp/sys_enter") int handle_sys_enter(struct pt_regs *ctx) { @@ -161,6 +178,21 @@ EXAMPLES return 0; } +**$ cat example2.bpf.c** + +:: + + #include <linux/ptrace.h> + #include <linux/bpf.h> + #include <bpf/bpf_helpers.h> + + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, int); + __type(value, long); + } my_map SEC(".maps"); + SEC("raw_tp/sys_exit") int handle_sys_exit(struct pt_regs *ctx) { @@ -170,9 +202,17 @@ EXAMPLES } This is example BPF application with two BPF programs and a mix of BPF maps -and global variables. +and global variables. Source code is split across two source code files. -**$ bpftool gen skeleton example.o** +**$ clang -target bpf -g example1.bpf.c -o example1.bpf.o** +**$ clang -target bpf -g example2.bpf.c -o example2.bpf.o** +**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o** + +This set of commands compiles *example1.bpf.c* and *example2.bpf.c* +individually and then statically links respective object files into the final +BPF ELF object file *example.bpf.o*. + +**$ bpftool gen skeleton example.bpf.o name example | tee example.skel.h** :: @@ -227,7 +267,7 @@ and global variables. #endif /* __EXAMPLE_SKEL_H__ */ -**$ cat example_user.c** +**$ cat example.c** :: @@ -270,7 +310,7 @@ and global variables. 
return err; } -**# ./example_user** +**# ./example** :: diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index fdffbc64c65c..d67518bcbd44 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -981,12 +981,25 @@ _bpftool() ;; gen) case $command in - skeleton) + object) _filedir + return 0 + ;; + skeleton) + case $prev in + $command) + _filedir + return 0 + ;; + *) + _bpftool_once_attr 'name' + return 0 + ;; + esac ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'object skeleton help' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index fe9e7b3a4b50..385d5c955cf3 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -36,6 +36,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; struct btf_attach_table { @@ -70,7 +71,9 @@ static const char *btf_var_linkage_str(__u32 linkage) case BTF_VAR_STATIC: return "static"; case BTF_VAR_GLOBAL_ALLOCATED: - return "global-alloc"; + return "global"; + case BTF_VAR_GLOBAL_EXTERN: + return "extern"; default: return "(unknown)"; } @@ -97,26 +100,28 @@ static const char *btf_str(const struct btf *btf, __u32 off) return btf__name_by_offset(btf, off) ? : "(invalid)"; } +static int btf_kind_safe(int kind) +{ + return kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; +} + static int dump_btf_type(const struct btf *btf, __u32 id, const struct btf_type *t) { json_writer_t *w = json_wtr; - int kind, safe_kind; - - kind = BTF_INFO_KIND(t->info); - safe_kind = kind <= BTF_KIND_MAX ? 
kind : BTF_KIND_UNKN; + int kind = btf_kind(t); if (json_output) { jsonw_start_object(w); jsonw_uint_field(w, "id", id); - jsonw_string_field(w, "kind", btf_kind_str[safe_kind]); + jsonw_string_field(w, "kind", btf_kind_str[btf_kind_safe(kind)]); jsonw_string_field(w, "name", btf_str(btf, t->name_off)); } else { - printf("[%u] %s '%s'", id, btf_kind_str[safe_kind], + printf("[%u] %s '%s'", id, btf_kind_str[btf_kind_safe(kind)], btf_str(btf, t->name_off)); } - switch (BTF_INFO_KIND(t->info)) { + switch (kind) { case BTF_KIND_INT: { __u32 v = *(__u32 *)(t + 1); const char *enc; @@ -299,7 +304,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, break; } case BTF_KIND_DATASEC: { - const struct btf_var_secinfo *v = (const void *)(t+1); + const struct btf_var_secinfo *v = (const void *)(t + 1); + const struct btf_type *vt; __u16 vlen = BTF_INFO_VLEN(t->info); int i; @@ -321,12 +327,26 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } else { printf("\n\ttype_id=%u offset=%u size=%u", v->type, v->offset, v->size); + + if (v->type <= btf__get_nr_types(btf)) { + vt = btf__type_by_id(btf, v->type); + printf(" (%s '%s')", + btf_kind_str[btf_kind_safe(btf_kind(vt))], + btf_str(btf, vt->name_off)); + } } } if (json_output) jsonw_end_array(w); break; } + case BTF_KIND_FLOAT: { + if (json_output) + jsonw_uint_field(w, "size", t->size); + else + printf(" size=%u", t->size); + break; + } default: break; } @@ -538,6 +558,7 @@ static int do_dump(int argc, char **argv) NEXT_ARG(); if (argc < 1) { p_err("expecting value for 'format' option\n"); + err = -EINVAL; goto done; } if (strcmp(*argv, "c") == 0) { @@ -547,11 +568,13 @@ static int do_dump(int argc, char **argv) } else { p_err("unrecognized format specifier: '%s', possible values: raw, c", *argv); + err = -EINVAL; goto done; } NEXT_ARG(); } else { p_err("unrecognized option: '%s'", *argv); + err = -EINVAL; goto done; } } diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 
0e9310727281..7ca54d046362 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -596,6 +596,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off)); break; case BTF_KIND_STRUCT: diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 359960a8f1de..40a88df275f9 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -336,6 +336,10 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ { "CONFIG_BPF_JIT_ALWAYS_ON", }, + /* Kernel BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF", }, + /* Kernel module BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF_MODULES", }, /* cgroups */ { "CONFIG_CGROUPS", }, diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 4033c46d83e7..31ade77f5ef8 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -273,7 +273,7 @@ static int do_skeleton(int argc, char **argv) char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; size_t i, map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); - char obj_name[MAX_OBJ_NAME_LEN], *obj_data; + char obj_name[MAX_OBJ_NAME_LEN] = "", *obj_data; struct 
bpf_object *obj = NULL; const char *file, *ident; struct bpf_program *prog; @@ -288,6 +288,28 @@ static int do_skeleton(int argc, char **argv) } file = GET_ARG(); + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + if (obj_name[0] != '\0') { + p_err("object name already specified"); + return -1; + } + + strncpy(obj_name, *argv, MAX_OBJ_NAME_LEN - 1); + obj_name[MAX_OBJ_NAME_LEN - 1] = '\0'; + } else { + p_err("unknown arg %s", *argv); + return -1; + } + + NEXT_ARG(); + } + if (argc) { p_err("extra unknown arguments"); return -1; @@ -310,7 +332,8 @@ static int do_skeleton(int argc, char **argv) p_err("failed to mmap() %s: %s", file, strerror(errno)); goto out; } - get_obj_name(obj_name, file); + if (obj_name[0] == '\0') + get_obj_name(obj_name, file); opts.object_name = obj_name; obj = bpf_object__open_mem(obj_data, file_sz, &opts); if (IS_ERR(obj)) { @@ -591,6 +614,47 @@ out: return err; } +static int do_object(int argc, char **argv) +{ + struct bpf_linker *linker; + const char *output_file, *file; + int err = 0; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + output_file = GET_ARG(); + + linker = bpf_linker__new(output_file, NULL); + if (!linker) { + p_err("failed to create BPF linker instance"); + return -1; + } + + while (argc) { + file = GET_ARG(); + + err = bpf_linker__add_file(linker, file); + if (err) { + p_err("failed to link '%s': %s (%d)", file, strerror(err), err); + goto out; + } + } + + err = bpf_linker__finalize(linker); + if (err) { + p_err("failed to finalize ELF file: %s (%d)", strerror(err), err); + goto out; + } + + err = 0; +out: + bpf_linker__free(linker); + return err; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -599,7 +663,8 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s %2$s skeleton FILE\n" + "Usage: %1$s %2$s object OUTPUT_FILE INPUT_FILE [INPUT_FILE...]\n" + " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s 
help\n" "\n" " " HELP_SPEC_OPTIONS "\n" @@ -610,6 +675,7 @@ static int do_help(int argc, char **argv) } static const struct cmd cmds[] = { + { "object", do_object }, { "skeleton", do_skeleton }, { "help", do_help }, { 0 } diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index b86f450e6fce..d9afb730136a 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -276,7 +276,7 @@ static int do_batch(int argc, char **argv) int n_argc; FILE *fp; char *cp; - int err; + int err = 0; int i; if (argc < 2) { @@ -370,7 +370,6 @@ static int do_batch(int argc, char **argv) } else { if (!json_output) printf("processed %d commands\n", lines); - err = 0; } err_close: if (fp != stdin) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b400364ee054..09ae0381205b 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -100,7 +100,7 @@ static int do_dump_btf(const struct btf_dumper *d, void *value) { __u32 value_id; - int ret; + int ret = 0; /* start of key-value pair */ jsonw_start_object(d->jw); diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index ff3aa0cf3997..f836d115d7d6 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -157,7 +157,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, if (len == 0) break; - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { ret = -LIBBPF_ERRNO__WRNGPID; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", 
[__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 8608cd68cdd0..6fc3e6f7f40c 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -196,6 +196,9 @@ static const char *print_imm(void *private_data, else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_FUNC) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "subprog[%+d]", insn->imm); else snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "0x%llx", (unsigned long long)full_imm); diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 80d966cfcaa1..7550fd9c3188 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -115,10 +115,10 @@ struct object { static int verbose; -int eprintf(int level, int var, const char *fmt, ...) +static int eprintf(int level, int var, const char *fmt, ...) { va_list args; - int ret; + int ret = 0; if (var >= level) { va_start(args, fmt); @@ -385,7 +385,7 @@ static int elf_collect(struct object *obj) static int symbols_collect(struct object *obj) { Elf_Scn *scn = NULL; - int n, i, err = 0; + int n, i; GElf_Shdr sh; char *name; @@ -402,11 +402,10 @@ static int symbols_collect(struct object *obj) * Scan symbols and look for the ones starting with * __BTF_ID__* over .BTF_ids section. 
*/ - for (i = 0; !err && i < n; i++) { - char *tmp, *prefix; + for (i = 0; i < n; i++) { + char *prefix; struct btf_id *id; GElf_Sym sym; - int err = -1; if (!gelf_getsym(obj->efile.symbols, i, &sym)) return -1; diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 9d9fb6209be1..3818ec511fd2 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -16,7 +16,10 @@ CFLAGS := -g -Wall # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../vmlinux /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) @@ -66,12 +69,16 @@ $(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) +ifeq ($(VMLINUX_H),) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." 
>&2; \ exit 1;\ fi $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f044..645530ca7e98 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,20 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, 0); + ptr = bpf_task_storage_get(&start, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + *ptr = bpf_ktime_get_ns(); return 0; } @@ -43,7 +48,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +57,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +75,16 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = 
next->pid; + /* For pid mismatch, save a bpf_task_storage_get */ + if (!pid || (targ_pid && targ_pid != pid)) + return 0; + /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(&start, &pid); + tsp = bpf_task_storage_get(&start, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ @@ -91,7 +100,7 @@ int handle__sched_switch(u64 *ctx) bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - bpf_map_delete_elem(&start, &pid); + bpf_task_storage_delete(&start, next); return 0; } diff --git a/tools/build/Build.include b/tools/build/Build.include index 585486e40995..2cf3b1bde86e 100644 --- a/tools/build/Build.include +++ b/tools/build/Build.include @@ -100,3 +100,27 @@ cxx_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(CXXFLAGS) -D"BUILD_STR(s)=\#s" $(CXX ## HOSTCC C flags host_c_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(KBUILD_HOSTCFLAGS) -D"BUILD_STR(s)=\#s" $(HOSTCFLAGS_$(basetarget).o) $(HOSTCFLAGS_$(obj)) + +# output directory for tests below +TMPOUT = .tmp_$$$$ + +# try-run +# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise) +# Exit code chooses option. "$$TMP" serves as a temporary file and is +# automatically cleaned up. +try-run = $(shell set -e; \ + TMP=$(TMPOUT)/tmp; \ + mkdir -p $(TMPOUT); \ + trap "rm -rf $(TMPOUT)" EXIT; \ + if ($(1)) >/dev/null 2>&1; \ + then echo "$(2)"; \ + else echo "$(3)"; \ + fi) + +# cc-option +# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) +cc-option = $(call try-run, \ + $(CC) -Werror $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + +# delete partially updated (i.e. 
corrupted) files on error +.DELETE_ON_ERROR: diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index c4225ed63565..1600b17dbb8a 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -128,9 +128,9 @@ def detect_kernel_config(): cfg['nr_nodes'] = prog['nr_online_nodes'].value_() - if prog.type('struct kmem_cache').members[1][1] == 'flags': + if prog.type('struct kmem_cache').members[1].name == 'flags': cfg['allocator'] = 'SLUB' - elif prog.type('struct kmem_cache').members[1][1] == 'batchcount': + elif prog.type('struct kmem_cache').members[1].name == 'batchcount': cfg['allocator'] = 'SLAB' else: err('Can\'t determine the slab allocator') @@ -193,7 +193,7 @@ def main(): # look over all slab pages, belonging to non-root memcgs # and look for objects belonging to the given memory cgroup for page in for_each_slab_page(prog): - objcg_vec_raw = page.obj_cgroups.value_() + objcg_vec_raw = page.memcg_data.value_() if objcg_vec_raw == 0: continue cache = page.slab_cache @@ -202,7 +202,7 @@ def main(): addr = cache.value_() caches[addr] = cache # clear the lowest bit to get the true obj_cgroups - objcg_vec = Object(prog, page.obj_cgroups.type_, + objcg_vec = Object(prog, 'struct obj_cgroup **', value=objcg_vec_raw & ~1) if addr not in stats: diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index 607b2b280945..719f18b1edf0 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -25,7 +25,7 @@ if [ "$1"x != "x" ]; then elif [ $1 -ge 0 ] 2>/dev/null ; then taint=$1 else - echo "Error: Parameter '$1' not a positive interger. Aborting." >&2 + echo "Error: Parameter '$1' not a positive integer. Aborting." 
>&2 exit 1 fi else diff --git a/tools/iio/Makefile b/tools/iio/Makefile index 3de763d9ab70..5d12ac4e7f8f 100644 --- a/tools/iio/Makefile +++ b/tools/iio/Makefile @@ -27,6 +27,7 @@ include $(srctree)/tools/build/Makefile.include # $(OUTPUT)include/linux/iio: ../../include/uapi/linux/iio mkdir -p $(OUTPUT)include/linux/iio 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/iio/buffer.h $@ ln -sf $(CURDIR)/../../include/uapi/linux/iio/events.h $@ ln -sf $(CURDIR)/../../include/uapi/linux/iio/types.h $@ diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index bb03859db89d..0076437f6e3f 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -14,6 +14,7 @@ #include <unistd.h> #include <stdlib.h> +#include <dirent.h> #include <stdbool.h> #include <stdio.h> #include <errno.h> @@ -280,22 +281,69 @@ static void print_event(struct iio_event_data *event) printf("\n"); } +/* Enable or disable events in sysfs if the knob is available */ +static void enable_events(char *dev_dir, int enable) +{ + const struct dirent *ent; + char evdir[256]; + int ret; + DIR *dp; + + snprintf(evdir, sizeof(evdir), FORMAT_EVENTS_DIR, dev_dir); + evdir[sizeof(evdir)-1] = '\0'; + + dp = opendir(evdir); + if (!dp) { + fprintf(stderr, "Enabling/disabling events: can't open %s\n", + evdir); + return; + } + + while (ent = readdir(dp), ent) { + if (iioutils_check_suffix(ent->d_name, "_en")) { + printf("%sabling: %s\n", + enable ? 
"En" : "Dis", + ent->d_name); + ret = write_sysfs_int(ent->d_name, evdir, + enable); + if (ret < 0) + fprintf(stderr, "Failed to enable/disable %s\n", + ent->d_name); + } + } + + if (closedir(dp) == -1) { + perror("Enabling/disabling channels: " + "Failed to close directory"); + return; + } +} + int main(int argc, char **argv) { struct iio_event_data event; const char *device_name; + char *dev_dir_name = NULL; char *chrdev_name; int ret; int dev_num; int fd, event_fd; - - if (argc <= 1) { - fprintf(stderr, "Usage: %s <device_name>\n", argv[0]); + bool all_events = false; + + if (argc == 2) { + device_name = argv[1]; + } else if (argc == 3) { + device_name = argv[2]; + if (!strcmp(argv[1], "-a")) + all_events = true; + } else { + fprintf(stderr, + "Usage: iio_event_monitor [options] <device_name>\n" + "Listen and display events from IIO devices\n" + " -a Auto-activate all available events\n"); return -1; } - device_name = argv[1]; - dev_num = find_type_by_name(device_name, "iio:device"); if (dev_num >= 0) { printf("Found IIO device with name %s with device number %d\n", @@ -303,6 +351,10 @@ int main(int argc, char **argv) ret = asprintf(&chrdev_name, "/dev/iio:device%d", dev_num); if (ret < 0) return -ENOMEM; + /* Look up sysfs dir as well if we can */ + ret = asprintf(&dev_dir_name, "%siio:device%d", iio_dir, dev_num); + if (ret < 0) + return -ENOMEM; } else { /* * If we can't find an IIO device by name assume device_name is @@ -313,6 +365,9 @@ int main(int argc, char **argv) return -ENOMEM; } + if (all_events && dev_dir_name) + enable_events(dev_dir_name, 1); + fd = open(chrdev_name, 0); if (fd == -1) { ret = -errno; @@ -365,6 +420,10 @@ int main(int argc, char **argv) perror("Failed to close event file"); error_free_chrdev_name: + /* Disable events after use */ + if (all_events && dev_dir_name) + enable_events(dev_dir_name, 0); + free(chrdev_name); return ret; diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c index 
34d63bcebcd2..2491c54a5e4f 100644 --- a/tools/iio/iio_generic_buffer.c +++ b/tools/iio/iio_generic_buffer.c @@ -30,6 +30,8 @@ #include <inttypes.h> #include <stdbool.h> #include <signal.h> +#include <sys/ioctl.h> +#include <linux/iio/buffer.h> #include "iio_utils.h" /** @@ -49,7 +51,7 @@ enum autochan { * Has the side effect of filling the channels[i].location values used * in processing the buffer output. **/ -int size_from_channelarray(struct iio_channel_info *channels, int num_channels) +static int size_from_channelarray(struct iio_channel_info *channels, int num_channels) { int bytes = 0; int i = 0; @@ -68,7 +70,7 @@ int size_from_channelarray(struct iio_channel_info *channels, int num_channels) return bytes; } -void print1byte(uint8_t input, struct iio_channel_info *info) +static void print1byte(uint8_t input, struct iio_channel_info *info) { /* * Shift before conversion to avoid sign extension @@ -85,7 +87,7 @@ void print1byte(uint8_t input, struct iio_channel_info *info) } } -void print2byte(uint16_t input, struct iio_channel_info *info) +static void print2byte(uint16_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -108,7 +110,7 @@ void print2byte(uint16_t input, struct iio_channel_info *info) } } -void print4byte(uint32_t input, struct iio_channel_info *info) +static void print4byte(uint32_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -131,7 +133,7 @@ void print4byte(uint32_t input, struct iio_channel_info *info) } } -void print8byte(uint64_t input, struct iio_channel_info *info) +static void print8byte(uint64_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -167,9 +169,8 @@ void print8byte(uint64_t input, struct iio_channel_info *info) * to fill the location offsets. 
* @num_channels: number of channels **/ -void process_scan(char *data, - struct iio_channel_info *channels, - int num_channels) +static void process_scan(char *data, struct iio_channel_info *channels, + int num_channels) { int k; @@ -198,7 +199,7 @@ void process_scan(char *data, printf("\n"); } -static int enable_disable_all_channels(char *dev_dir_name, int enable) +static int enable_disable_all_channels(char *dev_dir_name, int buffer_idx, int enable) { const struct dirent *ent; char scanelemdir[256]; @@ -206,7 +207,7 @@ static int enable_disable_all_channels(char *dev_dir_name, int enable) int ret; snprintf(scanelemdir, sizeof(scanelemdir), - FORMAT_SCAN_ELEMENTS_DIR, dev_dir_name); + FORMAT_SCAN_ELEMENTS_DIR, dev_dir_name, buffer_idx); scanelemdir[sizeof(scanelemdir)-1] = '\0'; dp = opendir(scanelemdir); @@ -238,12 +239,13 @@ static int enable_disable_all_channels(char *dev_dir_name, int enable) return 0; } -void print_usage(void) +static void print_usage(void) { fprintf(stderr, "Usage: generic_buffer [options]...\n" "Capture, convert and output data from IIO device buffer\n" " -a Auto-activate all available channels\n" " -A Force-activate ALL channels\n" + " -b <n> The buffer which to open (by index), default 0\n" " -c <n> Do n conversions, or loop forever if n < 0\n" " -e Disable wait for event (new data)\n" " -g Use trigger-less mode\n" @@ -257,12 +259,13 @@ void print_usage(void) " -w <n> Set delay between reads in us (event-less mode)\n"); } -enum autochan autochannels = AUTOCHANNELS_DISABLED; -char *dev_dir_name = NULL; -char *buf_dir_name = NULL; -bool current_trigger_set = false; +static enum autochan autochannels = AUTOCHANNELS_DISABLED; +static char *dev_dir_name = NULL; +static char *buf_dir_name = NULL; +static int buffer_idx = 0; +static bool current_trigger_set = false; -void cleanup(void) +static void cleanup(void) { int ret; @@ -287,21 +290,21 @@ void cleanup(void) /* Disable channels if auto-enabled */ if (dev_dir_name && autochannels == 
AUTOCHANNELS_ACTIVE) { - ret = enable_disable_all_channels(dev_dir_name, 0); + ret = enable_disable_all_channels(dev_dir_name, buffer_idx, 0); if (ret) fprintf(stderr, "Failed to disable all channels\n"); autochannels = AUTOCHANNELS_DISABLED; } } -void sig_handler(int signum) +static void sig_handler(int signum) { fprintf(stderr, "Caught signal %d\n", signum); cleanup(); exit(-signum); } -void register_cleanup(void) +static void register_cleanup(void) { struct sigaction sa = { .sa_handler = sig_handler }; const int signums[] = { SIGINT, SIGTERM, SIGABRT }; @@ -334,7 +337,9 @@ int main(int argc, char **argv) unsigned long long j; unsigned long toread; int ret, c; - int fp = -1; + struct stat st; + int fd = -1; + int buf_fd = -1; int num_channels = 0; char *trigger_name = NULL, *device_name = NULL; @@ -353,7 +358,7 @@ int main(int argc, char **argv) register_cleanup(); - while ((c = getopt_long(argc, argv, "aAc:egl:n:N:t:T:w:?", longopts, + while ((c = getopt_long(argc, argv, "aAb:c:egl:n:N:t:T:w:?", longopts, NULL)) != -1) { switch (c) { case 'a': @@ -362,7 +367,20 @@ int main(int argc, char **argv) case 'A': autochannels = AUTOCHANNELS_ENABLED; force_autochannels = true; - break; + break; + case 'b': + errno = 0; + buffer_idx = strtoll(optarg, &dummy, 10); + if (errno) { + ret = -errno; + goto error; + } + if (buffer_idx < 0) { + ret = -ERANGE; + goto error; + } + + break; case 'c': errno = 0; num_loops = strtoll(optarg, &dummy, 10); @@ -519,7 +537,7 @@ int main(int argc, char **argv) * Parse the files in scan_elements to identify what channels are * present */ - ret = build_channel_array(dev_dir_name, &channels, &num_channels); + ret = build_channel_array(dev_dir_name, buffer_idx, &channels, &num_channels); if (ret) { fprintf(stderr, "Problem reading scan element information\n" "diag %s\n", dev_dir_name); @@ -536,7 +554,7 @@ int main(int argc, char **argv) (autochannels == AUTOCHANNELS_ENABLED && force_autochannels)) { fprintf(stderr, "Enabling all channels\n"); - 
ret = enable_disable_all_channels(dev_dir_name, 1); + ret = enable_disable_all_channels(dev_dir_name, buffer_idx, 1); if (ret) { fprintf(stderr, "Failed to enable all channels\n"); goto error; @@ -545,7 +563,7 @@ int main(int argc, char **argv) /* This flags that we need to disable the channels again */ autochannels = AUTOCHANNELS_ACTIVE; - ret = build_channel_array(dev_dir_name, &channels, + ret = build_channel_array(dev_dir_name, buffer_idx, &channels, &num_channels); if (ret) { fprintf(stderr, "Problem reading scan element " @@ -566,7 +584,7 @@ int main(int argc, char **argv) fprintf(stderr, "Enable channels manually in " FORMAT_SCAN_ELEMENTS_DIR "/*_en or pass -a to autoenable channels and " - "try again.\n", dev_dir_name); + "try again.\n", dev_dir_name, buffer_idx); ret = -ENOENT; goto error; } @@ -577,12 +595,25 @@ int main(int argc, char **argv) * be built rather than found. */ ret = asprintf(&buf_dir_name, - "%siio:device%d/buffer", iio_dir, dev_num); + "%siio:device%d/buffer%d", iio_dir, dev_num, buffer_idx); if (ret < 0) { ret = -ENOMEM; goto error; } + if (stat(buf_dir_name, &st)) { + fprintf(stderr, "Could not stat() '%s', got error %d: %s\n", + buf_dir_name, errno, strerror(errno)); + ret = -errno; + goto error; + } + + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "File '%s' is not a directory\n", buf_dir_name); + ret = -EFAULT; + goto error; + } + if (!notrigger) { printf("%s %s\n", dev_dir_name, trigger_name); /* @@ -599,6 +630,35 @@ int main(int argc, char **argv) } } + ret = asprintf(&buffer_access, "/dev/iio:device%d", dev_num); + if (ret < 0) { + ret = -ENOMEM; + goto error; + } + + /* Attempt to open non blocking the access dev */ + fd = open(buffer_access, O_RDONLY | O_NONBLOCK); + if (fd == -1) { /* TODO: If it isn't there make the node */ + ret = -errno; + fprintf(stderr, "Failed to open %s\n", buffer_access); + goto error; + } + + /* specify for which buffer index we want an FD */ + buf_fd = buffer_idx; + + ret = ioctl(fd, 
IIO_BUFFER_GET_FD_IOCTL, &buf_fd); + if (ret == -1 || buf_fd == -1) { + ret = -errno; + if (ret == -ENODEV || ret == -EINVAL) + fprintf(stderr, + "Device does not have this many buffers\n"); + else + fprintf(stderr, "Failed to retrieve buffer fd\n"); + + goto error; + } + /* Setup ring buffer parameters */ ret = write_sysfs_int("length", buf_dir_name, buf_len); if (ret < 0) @@ -608,7 +668,8 @@ int main(int argc, char **argv) ret = write_sysfs_int("enable", buf_dir_name, 1); if (ret < 0) { fprintf(stderr, - "Failed to enable buffer: %s\n", strerror(-ret)); + "Failed to enable buffer '%s': %s\n", + buf_dir_name, strerror(-ret)); goto error; } @@ -619,24 +680,30 @@ int main(int argc, char **argv) goto error; } - ret = asprintf(&buffer_access, "/dev/iio:device%d", dev_num); - if (ret < 0) { - ret = -ENOMEM; - goto error; + /** + * This check is being done here for sanity reasons, however it + * should be omitted under normal operation. + * If this is buffer0, we check that we get EBUSY after this point. 
+ */ + if (buffer_idx == 0) { + errno = 0; + read_size = read(fd, data, 1); + if (read_size > -1 || errno != EBUSY) { + ret = -EFAULT; + perror("Reading from '%s' should not be possible after ioctl()"); + goto error; + } } - /* Attempt to open non blocking the access dev */ - fp = open(buffer_access, O_RDONLY | O_NONBLOCK); - if (fp == -1) { /* TODO: If it isn't there make the node */ - ret = -errno; - fprintf(stderr, "Failed to open %s\n", buffer_access); - goto error; - } + /* close now the main chardev FD and let the buffer FD work */ + if (close(fd) == -1) + perror("Failed to close character device file"); + fd = -1; for (j = 0; j < num_loops || num_loops < 0; j++) { if (!noevents) { struct pollfd pfd = { - .fd = fp, + .fd = buf_fd, .events = POLLIN, }; @@ -654,7 +721,7 @@ int main(int argc, char **argv) toread = 64; } - read_size = read(fp, data, toread * scan_size); + read_size = read(buf_fd, data, toread * scan_size); if (read_size < 0) { if (errno == EAGAIN) { fprintf(stderr, "nothing available\n"); @@ -671,7 +738,9 @@ int main(int argc, char **argv) error: cleanup(); - if (fp >= 0 && close(fp) == -1) + if (fd >= 0 && close(fd) == -1) + perror("Failed to close character device"); + if (buf_fd >= 0 && close(buf_fd) == -1) perror("Failed to close buffer"); free(buffer_access); free(data); diff --git a/tools/iio/iio_utils.c b/tools/iio/iio_utils.c index 7399eb7f1378..aadee6d34c74 100644 --- a/tools/iio/iio_utils.c +++ b/tools/iio/iio_utils.c @@ -77,15 +77,17 @@ int iioutils_break_up_name(const char *full_name, char **generic_name) * @mask: output a bit mask for the raw data * @be: output if data in big endian * @device_dir: the IIO device directory + * @buffer_idx: the IIO buffer index * @name: the channel name * @generic_name: the channel type name * * Returns a value >= 0 on success, otherwise a negative error code. 
**/ -int iioutils_get_type(unsigned *is_signed, unsigned *bytes, unsigned *bits_used, - unsigned *shift, uint64_t *mask, unsigned *be, - const char *device_dir, const char *name, - const char *generic_name) +static int iioutils_get_type(unsigned int *is_signed, unsigned int *bytes, + unsigned int *bits_used, unsigned int *shift, + uint64_t *mask, unsigned int *be, + const char *device_dir, int buffer_idx, + const char *name, const char *generic_name) { FILE *sysfsfp; int ret; @@ -95,7 +97,7 @@ int iioutils_get_type(unsigned *is_signed, unsigned *bytes, unsigned *bits_used, unsigned padint; const struct dirent *ent; - ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir); + ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir, buffer_idx); if (ret < 0) return -ENOMEM; @@ -303,12 +305,13 @@ void bsort_channel_array_by_index(struct iio_channel_info *ci_array, int cnt) /** * build_channel_array() - function to figure out what channels are present * @device_dir: the IIO device directory in sysfs + * @buffer_idx: the IIO buffer for this channel array * @ci_array: output the resulting array of iio_channel_info * @counter: output the amount of array elements * * Returns 0 on success, otherwise a negative error code. 
**/ -int build_channel_array(const char *device_dir, +int build_channel_array(const char *device_dir, int buffer_idx, struct iio_channel_info **ci_array, int *counter) { DIR *dp; @@ -321,7 +324,7 @@ int build_channel_array(const char *device_dir, char *filename; *counter = 0; - ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir); + ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir, buffer_idx); if (ret < 0) return -ENOMEM; @@ -502,6 +505,7 @@ int build_channel_array(const char *device_dir, ¤t->mask, ¤t->be, device_dir, + buffer_idx, current->name, current->generic_name); if (ret < 0) diff --git a/tools/iio/iio_utils.h b/tools/iio/iio_utils.h index 74bde4fde2c8..663c94a6c705 100644 --- a/tools/iio/iio_utils.h +++ b/tools/iio/iio_utils.h @@ -12,7 +12,8 @@ /* Made up value to limit allocation sizes */ #define IIO_MAX_NAME_LENGTH 64 -#define FORMAT_SCAN_ELEMENTS_DIR "%s/scan_elements" +#define FORMAT_SCAN_ELEMENTS_DIR "%s/buffer%d" +#define FORMAT_EVENTS_DIR "%s/events" #define FORMAT_TYPE_FILE "%s_type" #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -57,15 +58,11 @@ static inline int iioutils_check_suffix(const char *str, const char *suffix) } int iioutils_break_up_name(const char *full_name, char **generic_name); -int iioutils_get_type(unsigned *is_signed, unsigned *bytes, unsigned *bits_used, - unsigned *shift, uint64_t *mask, unsigned *be, - const char *device_dir, const char *name, - const char *generic_name); int iioutils_get_param_float(float *output, const char *param_name, const char *device_dir, const char *name, const char *generic_name); void bsort_channel_array_by_index(struct iio_channel_info *ci_array, int cnt); -int build_channel_array(const char *device_dir, +int build_channel_array(const char *device_dir, int buffer_idx, struct iio_channel_info **ci_array, int *counter); int find_type_by_name(const char *name, const char *type); int write_sysfs_int(const char *filename, const char *basedir, int val); diff 
--git a/tools/include/linux/kconfig.h b/tools/include/linux/kconfig.h new file mode 100644 index 000000000000..1555a0c4f345 --- /dev/null +++ b/tools/include/linux/kconfig.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_KCONFIG_H +#define _TOOLS_LINUX_KCONFIG_H + +/* CONFIG_CC_VERSION_TEXT (Do not delete this comment. See help in Kconfig) */ + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define __BIG_ENDIAN 4321 +#else +#define __LITTLE_ENDIAN 1234 +#endif + +#define __ARG_PLACEHOLDER_1 0, +#define __take_second_arg(__ignored, val, ...) val + +/* + * The use of "&&" / "||" is limited in certain expressions. + * The following enable to calculate "and" / "or" with macro expansion only. + */ +#define __and(x, y) ___and(x, y) +#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y) +#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0) + +#define __or(x, y) ___or(x, y) +#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y) +#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y) + +/* + * Helper macros to use CONFIG_ options in C/CPP expressions. Note that + * these only work with boolean and tristate options. + */ + +/* + * Getting something that works in C and CPP for an arg that may or may + * not be defined is tricky. Here, if we have "#define CONFIG_BOOGER 1" + * we match on the placeholder define, insert the "0," for arg1 and generate + * the triplet (0, 1, 0). Then the last step cherry picks the 2nd arg (a one). + * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when + * the last step cherry picks the 2nd arg, we get a zero. + */ +#define __is_defined(x) ___is_defined(x) +#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) +#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) + +/* + * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 + * otherwise. For boolean options, this is equivalent to + * IS_ENABLED(CONFIG_FOO). 
+ */ +#define IS_BUILTIN(option) __is_defined(option) + +/* + * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 + * otherwise. + */ +#define IS_MODULE(option) __is_defined(option##_MODULE) + +/* + * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled + * code can call a function defined in code compiled based on CONFIG_FOO. + * This is similar to IS_ENABLED(), but returns false when invoked from + * built-in code when CONFIG_FOO is set to 'm'. + */ +#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \ + __and(IS_MODULE(option), __is_defined(MODULE))) + +/* + * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', + * 0 otherwise. + */ +#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) + +#endif /* _TOOLS_LINUX_KCONFIG_H */ diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index ae5662d368b9..5a00b8b2cf9f 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -58,11 +58,25 @@ struct static_call_site { __raw_static_call(name); \ }) +struct static_call_key { + void *func; + union { + /* bit 0: 0 = mods, 1 = sites */ + unsigned long type; + struct static_call_mod *mods; + struct static_call_site *sites; + }; +}; + #else /* !CONFIG_HAVE_STATIC_CALL_INLINE */ #define __STATIC_CALL_ADDRESSABLE(name) #define __static_call(name) __raw_static_call(name) +struct static_call_key { + void *func; +}; + #endif /* CONFIG_HAVE_STATIC_CALL_INLINE */ #ifdef MODULE @@ -77,6 +91,10 @@ struct static_call_site { #else +struct static_call_key { + void *func; +}; + #define static_call(name) \ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) diff --git a/tools/include/uapi/asm/errno.h b/tools/include/uapi/asm/errno.h index 637189ec1ab9..d30439b4b8ab 100644 --- a/tools/include/uapi/asm/errno.h +++ b/tools/include/uapi/asm/errno.h @@ -9,8 +9,6 @@ #include "../../../arch/alpha/include/uapi/asm/errno.h" #elif 
defined(__mips__) #include "../../../arch/mips/include/uapi/asm/errno.h" -#elif defined(__ia64__) -#include "../../../arch/ia64/include/uapi/asm/errno.h" #elif defined(__xtensa__) #include "../../../arch/xtensa/include/uapi/asm/errno.h" #else diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -93,7 +93,738 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. 
On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. 
+ * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred to by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allows the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately).
+ * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. 
+ * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. 
Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. 
+ * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. 
After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*.
+ * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are deleted. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may have been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel.
+ * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. 
On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, @@ -247,6 +978,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -393,11 +1125,24 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
+ */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { @@ -720,7 +1465,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 @@ -1765,6 +2510,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -3333,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. 
* * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -3350,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -3360,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -3850,7 +4615,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -3867,6 +4632,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. 
Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -3891,11 +4664,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. @@ -3909,6 +4680,61 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. 
+ * + * The following is a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** returns 0, the helper will continue to the next + * element. If the return value is 1, the helper will skip the rest of + * the elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed to by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require reading kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character.
If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4901,8 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4168,6 +4996,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { @@ -4615,6 +5444,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; @@ -5205,7 +6036,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8b281f722e5b..f6afee209620 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1154,6 +1154,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) struct kvm_xen_hvm_config { __u32 flags; @@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; } u; }; /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ #define 
KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { diff --git a/tools/kvm/kvm_stat/kvm_stat.service b/tools/kvm/kvm_stat/kvm_stat.service index 71aabaffe779..8f13b843d5b4 100644 --- a/tools/kvm/kvm_stat/kvm_stat.service +++ b/tools/kvm/kvm_stat/kvm_stat.service @@ -9,6 +9,7 @@ Type=simple ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv ExecReload=/bin/kill -HUP $MAINPID Restart=always +RestartSec=60s SyslogIdentifier=kvm_stat SyslogLevel=debug diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 190366d05588..9b057cc7650a 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o + btf_dump.o ringbuf.o strset.o linker.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..e43e1896cb4b 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) @@ -215,7 +215,7 @@ define do_install if [ ! 
-d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ fi; \ - $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2' + $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' endef install_lib: all_cmd @@ -228,7 +228,6 @@ install_headers: $(BPF_HELPER_DEFS) $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ $(call do_install,btf.h,$(prefix)/include/bpf,644); \ - $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 53b3e199fb25..09ebe3db5f2f 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -88,11 +88,19 @@ enum bpf_enum_value_kind { const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ unsigned long long val; \ \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. 
\ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ - case 1: val = *(const unsigned char *)p; \ - case 2: val = *(const unsigned short *)p; \ - case 4: val = *(const unsigned int *)p; \ - case 8: val = *(const unsigned long long *)p; \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index ae6c975e0b87..9720dc0b4605 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,13 +25,21 @@ /* * Helper macro to place programs, maps, license in * different sections in elf_bpf file. Section names - * are interpreted by elf_bpf loader + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. */ -#define SEC(NAME) __attribute__((section(NAME), used)) +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ -#ifndef __always_inline +/* Avoid 'linux/stddef.h' definition of '__always_inline'. 
*/ +#undef __always_inline #define __always_inline inline __attribute__((always_inline)) -#endif + #ifndef __noinline #define __noinline __attribute__((noinline)) #endif @@ -40,7 +48,29 @@ #endif /* - * Helper macro to manipulate data structures + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) +#endif + +/* + * Helper macros to manipulate data structures */ #ifndef offsetof #define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f9ef37707888..8c954ebc0c7c 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -413,20 +413,56 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) 
arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + /* * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values * in a structure. */ -#define BPF_SEQ_PRINTF(seq, fmt, args...) \ - ({ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[] = { args }; \ - _Pragma("GCC diagnostic pop") \ - int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ - ___param, sizeof(___param)); \ - ___ret; \ - }) +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d9c10830d749..d57e13a13798 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -21,6 +21,7 @@ #include "libbpf.h" #include "libbpf_internal.h" #include "hashmap.h" +#include "strset.h" #define BTF_MAX_NR_TYPES 0x7fffffffU #define BTF_MAX_STR_OFFSET 0x7fffffffU @@ -67,7 +68,7 @@ struct btf { * | | | * hdr | | * types_data----+ | - * strs_data------------------+ + * strset__data(strs_set)-----+ * * +----------+---------+-----------+ * | Header | Types | Strings | @@ -105,20 +106,15 @@ struct btf { */ int start_str_off; + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ void *strs_data; - size_t strs_data_cap; /* used size stored in hdr->str_len */ - - /* lookup index for each unique string in strings section */ - struct hashmap *strs_hash; + /* a set of unique strings */ + struct strset *strs_set; /* whether strings are already deduplicated */ bool strs_deduped; - /* extra indirection layer to make strings hashmap work with stable - * string offsets and ability to transparently choose between - * btf->strs_data or btf_dedup->strs_data as a source of strings. - * This is used for BTF strings dedup to transfer deduplicated strings - * data back to struct btf without re-building strings index. - */ - void **strs_data_ptr; /* BTF object FD, if loaded into kernel */ int fd; @@ -142,8 +138,8 @@ static inline __u64 ptr_to_u64(const void *ptr) * On success, memory pointer to the beginning of unused memory is returned. 
* On error, NULL is returned. */ -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt) +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) { size_t new_cnt; void *new_data; @@ -179,14 +175,14 @@ void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, /* Ensure given dynamically allocated memory region has enough allocated space * to accommodate *need_cnt* elements of size *elem_sz* bytes each */ -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) { void *p; if (need_cnt <= *cap_cnt) return 0; - p = btf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); if (!p) return -ENOMEM; @@ -197,8 +193,8 @@ static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) { __u32 *p; - p = btf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), - btf->nr_types, BTF_MAX_NR_TYPES, 1); + p = libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, 1); if (!p) return -ENOMEM; @@ -291,6 +287,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); @@ -338,6 +335,7 @@ static int btf_bswap_type_rest(struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return 0; case BTF_KIND_INT: *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); @@ -433,7 +431,7 @@ const struct btf *btf__base_btf(const struct btf *btf) } /* internal helper returning non-const pointer to a type */ -static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) +struct btf_type 
*btf_type_by_id(struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; @@ -578,6 +576,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: size = t->size; goto done; case BTF_KIND_PTR: @@ -621,6 +620,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: return btf_ptr_sz(btf); @@ -734,7 +734,7 @@ void btf__free(struct btf *btf) */ free(btf->hdr); free(btf->types_data); - free(btf->strs_data); + strset__free(btf->strs_set); } free(btf->raw_data); free(btf->raw_data_swapped); @@ -1242,6 +1242,11 @@ void btf__set_fd(struct btf *btf, int fd) btf->fd = fd; } +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? btf->strs_data : strset__data(btf->strs_set); +} + static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) { struct btf_header *hdr = btf->hdr; @@ -1282,7 +1287,7 @@ static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endi } p += hdr->type_len; - memcpy(p, btf->strs_data, hdr->str_len); + memcpy(p, btf_strs_data(btf), hdr->str_len); p += hdr->str_len; *size = data_sz; @@ -1316,7 +1321,7 @@ const char *btf__str_by_offset(const struct btf *btf, __u32 offset) if (offset < btf->start_str_off) return btf__str_by_offset(btf->base_btf, offset); else if (offset - btf->start_str_off < btf->hdr->str_len) - return btf->strs_data + (offset - btf->start_str_off); + return btf_strs_data(btf) + (offset - btf->start_str_off); else return NULL; } @@ -1470,25 +1475,6 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, return 0; } -static size_t strs_hash_fn(const void *key, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str = strs + (long)key; - - return str_hash(str); 
-} - -static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str1 = strs + (long)key1; - const char *str2 = strs + (long)key2; - - return strcmp(str1, str2) == 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -1507,10 +1493,9 @@ static void btf_invalidate_raw_data(struct btf *btf) */ static int btf_ensure_modifiable(struct btf *btf) { - void *hdr, *types, *strs, *strs_end, *s; - struct hashmap *hash = NULL; - long off; - int err; + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; if (btf_is_modifiable(btf)) { /* any BTF modification invalidates raw_data */ @@ -1521,44 +1506,25 @@ static int btf_ensure_modifiable(struct btf *btf) /* split raw data into three memory regions */ hdr = malloc(btf->hdr->hdr_len); types = malloc(btf->hdr->type_len); - strs = malloc(btf->hdr->str_len); - if (!hdr || !types || !strs) + if (!hdr || !types) goto err_out; memcpy(hdr, btf->hdr, btf->hdr->hdr_len); memcpy(types, btf->types_data, btf->hdr->type_len); - memcpy(strs, btf->strs_data, btf->hdr->str_len); - - /* make hashmap below use btf->strs_data as a source of strings */ - btf->strs_data_ptr = &btf->strs_data; /* build lookup index for all strings */ - hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf); - if (IS_ERR(hash)) { - err = PTR_ERR(hash); - hash = NULL; + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); goto err_out; } - strs_end = strs + btf->hdr->str_len; - for (off = 0, s = strs; s < strs_end; off += strlen(s) + 1, s = strs + off) { - /* hashmap__add() returns EEXIST if string with the same - * content already is in the hash map - */ - err = hashmap__add(hash, (void *)off, (void *)off); - if (err == -EEXIST) - continue; /* duplicate */ - if (err) - goto err_out; - } - /* only when everything was successful, update internal 
state */ btf->hdr = hdr; btf->types_data = types; btf->types_data_cap = btf->hdr->type_len; - btf->strs_data = strs; - btf->strs_data_cap = btf->hdr->str_len; - btf->strs_hash = hash; + btf->strs_data = NULL; + btf->strs_set = set; /* if BTF was created from scratch, all strings are guaranteed to be * unique and deduplicated */ @@ -1573,17 +1539,10 @@ static int btf_ensure_modifiable(struct btf *btf) return 0; err_out: - hashmap__free(hash); + strset__free(set); free(hdr); free(types); - free(strs); - return -ENOMEM; -} - -static void *btf_add_str_mem(struct btf *btf, size_t add_sz) -{ - return btf_add_mem(&btf->strs_data, &btf->strs_data_cap, 1, - btf->hdr->str_len, BTF_MAX_STR_OFFSET, add_sz); + return err; } /* Find an offset in BTF string section that corresponds to a given string *s*. @@ -1594,34 +1553,23 @@ static void *btf_add_str_mem(struct btf *btf, size_t add_sz) */ int btf__find_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } /* BTF needs to be in a modifiable state to build string lookup index */ if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* see btf__add_str() for why we do this */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; - - new_off = btf->hdr->str_len; - memcpy(p, s, len); - - if (hashmap__find(btf->strs_hash, (void *)new_off, (void **)&old_off)) - return btf->start_str_off + old_off; + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return off; - return -ENOENT; + return btf->start_str_off + off; } /* Add a string s to the BTF string section. 
@@ -1631,61 +1579,30 @@ int btf__find_str(struct btf *btf, const char *s) */ int btf__add_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; - int err; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* Hashmap keys are always offsets within btf->strs_data, so to even - * look up some string from the "outside", we need to first append it - * at the end, so that it can be addressed with an offset. Luckily, - * until btf->hdr->str_len is incremented, that string is just a piece - * of garbage for the rest of BTF code, so no harm, no foul. On the - * other hand, if the string is unique, it's already appended and - * ready to be used, only a simple btf->hdr->str_len increment away. - */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; - - new_off = btf->hdr->str_len; - memcpy(p, s, len); + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return off; - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). 
- */ - err = hashmap__insert(btf->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) - return btf->start_str_off + old_off; /* duplicated string, return existing offset */ - if (err) - return err; + btf->hdr->str_len = strset__data_size(btf->strs_set); - btf->hdr->str_len += len; /* new unique string, adjust data length */ - return btf->start_str_off + new_off; + return btf->start_str_off + off; } static void *btf_add_type_mem(struct btf *btf, size_t add_sz) { - return btf_add_mem(&btf->types_data, &btf->types_data_cap, 1, - btf->hdr->type_len, UINT_MAX, add_sz); -} - -static __u32 btf_type_info(int kind, int vlen, int kflag) -{ - return (kflag << 31) | (kind << 24) | vlen; + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); } static void btf_type_inc_vlen(struct btf_type *t) @@ -1707,6 +1624,54 @@ static int btf_commit_type(struct btf *btf, int data_sz) return btf->start_id + btf->nr_types - 1; } +struct btf_pipe { + const struct btf *src; + struct btf *dst; +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + int off; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return sz; + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return err; + + return btf_commit_type(btf, sz); +} + /* * Append 
new BTF_KIND_INT type with: * - *name* - non-empty, non-NULL type name; @@ -1756,6 +1721,47 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return -EINVAL; + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return -EINVAL; + + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + /* it's completely legal to append BTF types with type IDs pointing forward to * types that haven't been appended yet, so we only make sure that id looks * sane, we can't guarantee that ID will always be valid @@ -1883,7 +1889,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 * - *byte_sz* - size of the struct, in bytes; * * Struct initially has no fields in it. Fields can be added by - * btf__add_field() right after btf__add_struct() succeeds. + * btf__add_field() right after btf__add_struct() succeeds. 
* * Returns: * - >0, type ID of newly added BTF type; @@ -2971,10 +2977,7 @@ struct btf_dedup { /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; /* temporary strings deduplication state */ - void *strs_data; - size_t strs_cap; - size_t strs_len; - struct hashmap* strs_hash; + struct strset *strs_set; }; static long hash_combine(long h, long value) @@ -3110,95 +3113,28 @@ done: return d; } -typedef int (*str_off_fn_t)(__u32 *str_off_ptr, void *ctx); - /* * Iterate over all possible places in .BTF and .BTF.ext that can reference * string and pass pointer to it to a provided callback `fn`. */ -static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) { - void *line_data_cur, *line_data_end; - int i, j, r, rec_size; - struct btf_type *t; + int i, r; for (i = 0; i < d->btf->nr_types; i++) { - t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = fn(&t->name_off, ctx); + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); if (r) return r; - - switch (btf_kind(t)) { - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_ENUM: { - struct btf_enum *m = btf_enum(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_FUNC_PROTO: { - struct btf_param *m = btf_params(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - default: - break; - } } if (!d->btf_ext) return 0; - line_data_cur = d->btf_ext->line_info.info; - line_data_end = d->btf_ext->line_info.info + d->btf_ext->line_info.len; - rec_size = 
d->btf_ext->line_info.rec_size; - - while (line_data_cur < line_data_end) { - struct btf_ext_info_sec *sec = line_data_cur; - struct bpf_line_info_min *line_info; - __u32 num_info = sec->num_info; - - r = fn(&sec->sec_name_off, ctx); - if (r) - return r; - - line_data_cur += sizeof(struct btf_ext_info_sec); - for (i = 0; i < num_info; i++) { - line_info = line_data_cur; - r = fn(&line_info->file_name_off, ctx); - if (r) - return r; - r = fn(&line_info->line_off, ctx); - if (r) - return r; - line_data_cur += rec_size; - } - } + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; return 0; } @@ -3207,10 +3143,8 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { struct btf_dedup *d = ctx; __u32 str_off = *str_off_ptr; - long old_off, new_off, len; const char *s; - void *p; - int err; + int off, err; /* don't touch empty string or string in main BTF */ if (str_off == 0 || str_off < d->btf->start_str_off) @@ -3227,29 +3161,11 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) return err; } - len = strlen(s) + 1; - - new_off = d->strs_len; - p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); - if (!p) - return -ENOMEM; + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; - memcpy(p, s, len); - - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). 
- */ - err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) { - *str_off_ptr = d->btf->start_str_off + old_off; - } else if (err) { - return err; - } else { - *str_off_ptr = d->btf->start_str_off + new_off; - d->strs_len += len; - } + *str_off_ptr = d->btf->start_str_off + off; return 0; } @@ -3266,39 +3182,23 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) */ static int btf_dedup_strings(struct btf_dedup *d) { - char *s; int err; if (d->btf->strs_deduped) return 0; - /* temporarily switch to use btf_dedup's strs_data for strings for hash - * functions; later we'll just transfer hashmap to struct btf as is, - * along the strs_data - */ - d->btf->strs_data_ptr = &d->strs_data; - - d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf); - if (IS_ERR(d->strs_hash)) { - err = PTR_ERR(d->strs_hash); - d->strs_hash = NULL; + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); goto err_out; } if (!d->btf->base_btf) { - s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); - if (!s) - return -ENOMEM; - /* initial empty string */ - s[0] = 0; - d->strs_len = 1; - /* insert empty string; we won't be looking it up during strings * dedup, but it's good to have it for generic BTF string lookups */ - err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, - HASHMAP_ADD, NULL, NULL); - if (err) + err = strset__add_str(d->strs_set, ""); + if (err < 0) goto err_out; } @@ -3308,28 +3208,16 @@ static int btf_dedup_strings(struct btf_dedup *d) goto err_out; /* replace BTF string data and hash with deduped ones */ - free(d->btf->strs_data); - hashmap__free(d->btf->strs_hash); - d->btf->strs_data = d->strs_data; - d->btf->strs_data_cap = d->strs_cap; - d->btf->hdr->str_len = d->strs_len; - d->btf->strs_hash = d->strs_hash; - /* now point strs_data_ptr back to 
btf->strs_data */ - d->btf->strs_data_ptr = &d->btf->strs_data; - - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; d->btf->strs_deduped = true; return 0; err_out: - free(d->strs_data); - hashmap__free(d->strs_hash); - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; - - /* restore strings pointer for existing d->btf->strs_hash back */ - d->btf->strs_data_ptr = &d->strs_data; + strset__free(d->strs_set); + d->strs_set = NULL; return err; } @@ -3626,6 +3514,7 @@ static int btf_dedup_prep(struct btf_dedup *d) case BTF_KIND_FWD: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: h = btf_hash_common(t); break; case BTF_KIND_INT: @@ -3722,6 +3611,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) break; case BTF_KIND_FWD: + case BTF_KIND_FLOAT: h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { cand_id = (__u32)(long)hash_entry->value; @@ -3983,6 +3873,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return btf_compat_enum(cand_type, canon_type); case BTF_KIND_FWD: + case BTF_KIND_FLOAT: return btf_equal_common(cand_type, canon_type); case BTF_KIND_CONST: @@ -4450,15 +4341,18 @@ static int btf_dedup_compact_types(struct btf_dedup *d) * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, * which is populated during compaction phase. 
*/ -static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) { + struct btf_dedup *d = ctx; __u32 resolved_type_id, new_type_id; - resolved_type_id = resolve_type_id(d, type_id); + resolved_type_id = resolve_type_id(d, *type_id); new_type_id = d->hypot_map[resolved_type_id]; if (new_type_id > BTF_MAX_NR_TYPES) return -EINVAL; - return new_type_id; + + *type_id = new_type_id; + return 0; } /* @@ -4471,111 +4365,28 @@ static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) * referenced from any BTF type (e.g., struct fields, func proto args, etc) to * their final deduped type IDs. */ -static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_types(struct btf_dedup *d) { - struct btf_type *t = btf_type_by_id(d->btf, type_id); int i, r; - switch (btf_kind(t)) { - case BTF_KIND_INT: - case BTF_KIND_ENUM: - break; - - case BTF_KIND_FWD: - case BTF_KIND_CONST: - case BTF_KIND_VOLATILE: - case BTF_KIND_RESTRICT: - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_FUNC: - case BTF_KIND_VAR: - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) - return r; - t->type = r; - break; - - case BTF_KIND_ARRAY: { - struct btf_array *arr_info = btf_array(t); + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = btf_dedup_remap_type_id(d, arr_info->type); - if (r < 0) - return r; - arr_info->type = r; - r = btf_dedup_remap_type_id(d, arr_info->index_type); - if (r < 0) + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) return r; - arr_info->index_type = r; - break; - } - - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *member = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, member->type); - if (r < 0) - return r; - member->type = r; - member++; - } - break; } - case 
BTF_KIND_FUNC_PROTO: { - struct btf_param *param = btf_params(t); - __u16 vlen = btf_vlen(t); - - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) - return r; - t->type = r; - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, param->type); - if (r < 0) - return r; - param->type = r; - param++; - } - break; - } - - case BTF_KIND_DATASEC: { - struct btf_var_secinfo *var = btf_var_secinfos(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, var->type); - if (r < 0) - return r; - var->type = r; - var++; - } - break; - } + if (!d->btf_ext) + return 0; - default: - return -EINVAL; - } + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; return 0; } -static int btf_dedup_remap_types(struct btf_dedup *d) -{ - int i, r; - - for (i = 0; i < d->btf->nr_types; i++) { - r = btf_dedup_remap_type(d, d->btf->start_id + i); - if (r < 0) - return r; - } - return 0; -} - /* * Probe few well-known locations for vmlinux kernel image and try to load BTF * data out of it to use for target BTF. 
@@ -4626,3 +4437,200 @@ struct btf *libbpf_find_kernel_btf(void) pr_warn("failed to find valid kernel BTF\n"); return ERR_PTR(-ESRCH); } + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); 
+ if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 1237bcd1dd17..b54f1c3ebd57 100644 --- a/tools/lib/bpf/btf.h +++ 
b/tools/lib/bpf/btf.h @@ -93,8 +93,11 @@ LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); LIBBPF_API int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems); @@ -173,6 +176,7 @@ struct btf_dump_emit_type_decl_opts { int indent_level; /* strip all the const/volatile/restrict mods */ bool strip_mods; + size_t :0; }; #define btf_dump_emit_type_decl_opts__last_field strip_mods @@ -294,6 +298,11 @@ static inline bool btf_is_datasec(const struct btf_type *t) return btf_kind(t) == BTF_KIND_DATASEC; } +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..5e2809d685bf 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -166,11 +166,11 @@ static int btf_dump_resize(struct btf_dump *d) if (last_id <= d->last_id) return 0; - if (btf_ensure_mem((void **)&d->type_states, &d->type_states_cap, - sizeof(*d->type_states), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) return -ENOMEM; - if (btf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, - sizeof(*d->cached_names), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) return -ENOMEM; if (d->last_id == 0) { 
@@ -279,6 +279,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) case BTF_KIND_INT: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_FLOAT: break; case BTF_KIND_VOLATILE: @@ -453,6 +454,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) switch (btf_kind(t)) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: tstate->order_state = ORDERED; return 0; @@ -462,7 +464,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return err; case BTF_KIND_ARRAY: - return btf_dump_order_type(d, btf_array(t)->type, through_ptr); + return btf_dump_order_type(d, btf_array(t)->type, false); case BTF_KIND_STRUCT: case BTF_KIND_UNION: { @@ -1133,6 +1135,7 @@ skip_mod: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: goto done; default: pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", @@ -1247,6 +1250,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, switch (kind) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: btf_dump_emit_mods(d, decls); name = btf_name_of(d, t->name_off); btf_dump_printf(d, "%s", name); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..e2a3cf437814 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -55,10 +55,6 @@ #include "libbpf_internal.h" #include "hashmap.h" -#ifndef EM_BPF -#define EM_BPF 247 -#endif - #ifndef BPF_FS_MAGIC #define BPF_FS_MAGIC 0xcafe4a11 #endif @@ -73,8 +69,7 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static const struct btf_type * -skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) @@ -178,6 +173,8 @@ enum kern_feature_id { FEAT_PROG_BIND_MAP, /* Kernel support for module BTFs */ 
FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, __FEAT_CNT, }; @@ -187,7 +184,9 @@ enum reloc_type { RELO_LD64, RELO_CALL, RELO_DATA, - RELO_EXTERN, + RELO_EXTERN_VAR, + RELO_EXTERN_FUNC, + RELO_SUBPROG_ADDR, }; struct reloc_desc { @@ -195,7 +194,6 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; - bool processed; }; struct bpf_sec_def; @@ -275,6 +273,7 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; bool load; + bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; @@ -501,8 +500,6 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -574,6 +571,21 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -628,25 +640,29 @@ static int bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, const char *sec_name, int sec_idx) { + Elf_Data *symbols = obj->efile.symbols; struct bpf_program *prog, *progs; void *data = sec_data->d_buf; - size_t sec_sz = sec_data->d_size, sec_off, prog_sz; - int nr_progs, err; + 
size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; const char *name; GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(GElf_Sym); sec_off = 0; - while (sec_off < sec_sz) { - if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { - pr_warn("sec '%s': failed to find program symbol at offset %zu\n", - sec_name, sec_off); - return -LIBBPF_ERRNO__FORMAT; - } + for (i = 0; i < nr_syms; i++) { + if (!gelf_getsym(symbols, i, &sym)) + continue; + if (sym.st_shndx != sec_idx) + continue; + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; prog_sz = sym.st_size; + sec_off = sym.st_value; name = elf_sym_str(obj, sym.st_name); if (!name) { @@ -684,10 +700,17 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + /* if function is a global/weak symbol, but has hidden + * visibility (STV_HIDDEN), mark its BTF FUNC as static to + * enable more permissive BPF verification mode with more + * outside context available to BPF verifier + */ + if (GELF_ST_BIND(sym.st_info) != STB_LOCAL + && GELF_ST_VISIBILITY(sym.st_other) == STV_HIDDEN) + prog->mark_btf_static = true; + nr_progs++; obj->nr_programs = nr_progs; - - sec_off += prog_sz; } return 0; @@ -1121,11 +1144,6 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.obj_buf_sz = 0; } -/* if libelf is old and doesn't support mmap(), fall back to read() */ -#ifndef ELF_C_READ_MMAP -#define ELF_C_READ_MMAP ELF_C_READ -#endif - static int bpf_object__elf_init(struct bpf_object *obj) { int err = 0; @@ -1181,7 +1199,8 @@ static int bpf_object__elf_init(struct bpf_object *obj) if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) { pr_warn("elf: failed to get section names strings from %s: %s\n", obj->path, elf_errmsg(-1)); - return -LIBBPF_ERRNO__FORMAT; + err = -LIBBPF_ERRNO__FORMAT; + goto errout; } /* Old LLVM set e_machine to EM_NONE */ @@ -1885,7 +1904,7 @@ 
static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) return 0; } -static const struct btf_type * +const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { const struct btf_type *t = btf__type_by_id(btf, id); @@ -1916,9 +1935,9 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) return btf_is_func_proto(t) ? t : NULL; } -static const char *btf_kind_str(const struct btf_type *t) +static const char *__btf_kind_str(__u16 kind) { - switch (btf_kind(t)) { + switch (kind) { case BTF_KIND_UNKN: return "void"; case BTF_KIND_INT: return "int"; case BTF_KIND_PTR: return "ptr"; @@ -1935,10 +1954,16 @@ static const char *btf_kind_str(const struct btf_type *t) case BTF_KIND_FUNC_PROTO: return "func_proto"; case BTF_KIND_VAR: return "var"; case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; default: return "unknown"; } } +const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + /* * Fetch integer attribute of BTF map definition. 
Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -1993,254 +2018,262 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } - -static int parse_btf_map_def(struct bpf_object *obj, - struct bpf_map *map, - const struct btf_type *def, - bool strict, bool is_inner, - const char *pin_root_path) +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) { const struct btf_type *t; const struct btf_member *m; + bool is_inner = inner_def == NULL; int vlen, i; - vlen = btf_vlen(def); - m = btf_members(def); + vlen = btf_vlen(def_t); + m = btf_members(def_t); for (i = 0; i < vlen; i++, m++) { - const char *name = btf__name_by_offset(obj->btf, m->name_off); + const char *name = btf__name_by_offset(btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map->name, i); + pr_warn("map '%s': invalid field #%d.\n", map_name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.type)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) return -EINVAL; - pr_debug("map '%s': found type = %u.\n", - map->name, map->def.type); + map_def->parts |= MAP_DEF_MAP_TYPE; } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.max_entries)) + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) return -EINVAL; - pr_debug("map '%s': found max_entries = %u.\n", - map->name, map->def.max_entries); + map_def->parts |= MAP_DEF_MAX_ENTRIES; } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.map_flags)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) return -EINVAL; - pr_debug("map '%s': found map_flags = %u.\n", - map->name, map->def.map_flags); + 
map_def->parts |= MAP_DEF_MAP_FLAGS; } else if (strcmp(name, "numa_node") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, &map->numa_node)) + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) return -EINVAL; - pr_debug("map '%s': found numa_node = %u.\n", map->name, map->numa_node); + map_def->parts |= MAP_DEF_NUMA_NODE; } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found key_size = %u.\n", - map->name, sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map->name, map->def.key_size, sz); + map_name, map_def->key_size, sz); return -EINVAL; } - map->def.key_size = sz; + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; } else if (strcmp(name, "key") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map->name, map->def.key_size, (ssize_t)sz); + map_name, map_def->key_size, (ssize_t)sz); return -EINVAL; } - map->def.key_size = sz; - map->btf_key_type_id = t->type; + 
map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found value_size = %u.\n", - map->name, sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map->name, map->def.value_size, sz); + map_name, map_def->value_size, sz); return -EINVAL; } - map->def.value_size = sz; + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; } else if (strcmp(name, "value") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': value type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map->name, map->def.value_size, (ssize_t)sz); + map_name, map_def->value_size, (ssize_t)sz); return -EINVAL; } - map->def.value_size = sz; - map->btf_value_type_id = t->type; + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; } else if (strcmp(name, "values") == 0) { + char 
inner_map_name[128]; int err; if (is_inner) { pr_warn("map '%s': multi-level inner maps not supported.\n", - map->name); + map_name); return -ENOTSUP; } if (i != vlen - 1) { pr_warn("map '%s': '%s' member should be last.\n", - map->name, name); + map_name, name); return -EINVAL; } - if (!bpf_map_type__is_map_in_map(map->def.type)) { + if (!bpf_map_type__is_map_in_map(map_def->map_type)) { pr_warn("map '%s': should be map-in-map.\n", - map->name); + map_name); return -ENOTSUP; } - if (map->def.value_size && map->def.value_size != 4) { + if (map_def->value_size && map_def->value_size != 4) { pr_warn("map '%s': conflicting value size %u != 4.\n", - map->name, map->def.value_size); + map_name, map_def->value_size); return -EINVAL; } - map->def.value_size = 4; - t = btf__type_by_id(obj->btf, m->type); + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': map-in-map inner type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_array(t) || btf_array(t)->nelems) { pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", - map->name); + map_name); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, - NULL); + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); if (!btf_is_ptr(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + t = skip_mods_and_typedefs(btf, t->type, NULL); if (!btf_is_struct(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - map->inner_map = calloc(1, sizeof(*map->inner_map)); - if (!map->inner_map) - return -ENOMEM; - map->inner_map->sec_idx = obj->efile.btf_maps_shndx; - map->inner_map->name = malloc(strlen(map->name) + - sizeof(".inner") + 1); - if 
(!map->inner_map->name) - return -ENOMEM; - sprintf(map->inner_map->name, "%s.inner", map->name); - - err = parse_btf_map_def(obj, map->inner_map, t, strict, - true /* is_inner */, NULL); + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); if (err) return err; + + map_def->parts |= MAP_DEF_INNER_MAP; } else if (strcmp(name, "pinning") == 0) { __u32 val; - int err; if (is_inner) { - pr_debug("map '%s': inner def can't be pinned.\n", - map->name); + pr_warn("map '%s': inner def can't be pinned.\n", map_name); return -EINVAL; } - if (!get_map_field_int(map->name, obj->btf, m, &val)) + if (!get_map_field_int(map_name, btf, m, &val)) return -EINVAL; - pr_debug("map '%s': found pinning = %u.\n", - map->name, val); - - if (val != LIBBPF_PIN_NONE && - val != LIBBPF_PIN_BY_NAME) { + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map->name, val); + map_name, val); return -EINVAL; } - if (val == LIBBPF_PIN_BY_NAME) { - err = build_map_pin_path(map, pin_root_path); - if (err) { - pr_warn("map '%s': couldn't build pin path.\n", - map->name); - return err; - } - } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; } else { if (strict) { - pr_warn("map '%s': unknown field '%s'.\n", - map->name, name); + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); return -ENOTSUP; } - pr_debug("map '%s': ignoring unknown field '%s'.\n", - map->name, name); + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); } } - if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map->name); + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); return -EINVAL; } return 0; } +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + 
map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = %u.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + static int bpf_object__init_user_btf_map(struct bpf_object *obj, const struct btf_type *sec, int var_idx, int sec_idx, const Elf_Data *data, bool strict, const char *pin_root_path) { + struct btf_map_def map_def = {}, inner_def = {}; const struct btf_type *var, *def; const struct btf_var_secinfo *vi; const struct btf_var *var_extra; const char *map_name; struct bpf_map *map; + int err; vi = btf_var_secinfos(sec) + var_idx; var = btf__type_by_id(obj->btf, vi->type); @@ -2294,7 +2327,35 @@ static int 
bpf_object__init_user_btf_map(struct bpf_object *obj, pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + return 0; } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, @@ -2384,15 +2445,17 @@ static bool btf_needs_sanitization(struct bpf_object *obj) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); - return !has_func || !has_datasec || !has_func_global; + return !has_func || !has_datasec || !has_func_global || !has_float; } static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); struct btf_type *t; int i, j, vlen; @@ -2445,6 +2508,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) } else if (!has_func_global && btf_is_func(t)) 
{ /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. "float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); } } } @@ -2587,7 +2657,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { struct btf *kern_btf = obj->btf; bool btf_mandatory, sanitize; - int err = 0; + int i, err = 0; if (!obj->btf) return 0; @@ -2601,6 +2671,38 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. 
+ */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__get_nr_types(obj->btf); + for (j = 1; j <= n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; @@ -2751,26 +2853,6 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym) -{ - Elf_Data *symbols = obj->efile.symbols; - size_t n = symbols->d_size / sizeof(GElf_Sym); - int i; - - for (i = 0; i < n; i++) { - if (!gelf_getsym(symbols, i, sym)) - continue; - if (sym->st_shndx != sec_idx || sym->st_value != off) - continue; - if (GELF_ST_TYPE(sym->st_info) != sym_type) - continue; - return 0; - } - - return -ENOENT; -} - static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -2784,7 +2866,7 @@ static bool ignore_elf_section(GElf_Shdr *hdr, const char *name) return true; /* ignore .llvm_addrsig section as well */ - if (hdr->sh_type == 0x6FFF4C03 /* SHT_LLVM_ADDRSIG */) + if (hdr->sh_type == SHT_LLVM_ADDRSIG) return true; /* no subprograms will lead to an empty .text section, ignore it */ @@ -2974,10 +3056,27 @@ static bool sym_is_extern(const GElf_Sym *sym) GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; } +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) +{ + int bind = GELF_ST_BIND(sym->st_info); + int type = GELF_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != 
text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; - const char *var_name; + const char *tname; int i, n; if (!btf) @@ -2987,14 +3086,18 @@ static int find_extern_btf_id(const struct btf *btf, const char *ext_name) for (i = 1; i <= n; i++) { t = btf__type_by_id(btf, i); - if (!btf_is_var(t)) + if (!btf_is_var(t) && !btf_is_func(t)) continue; - var_name = btf__name_by_offset(btf, t->name_off); - if (strcmp(var_name, ext_name)) + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) continue; - if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) return -EINVAL; return i; @@ -3107,12 +3210,48 @@ static int find_int_btf_id(const struct btf *btf) return 0; } +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. 
*/ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + static int bpf_object__collect_externs(struct bpf_object *obj) { struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; const struct btf_type *t; struct extern_desc *ext; - int i, n, off; + int i, n, off, dummy_var_btf_id; const char *ext_name, *sec_name; Elf_Scn *scn; GElf_Shdr sh; @@ -3124,6 +3263,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (elf_sec_hdr(obj, scn, &sh)) return -LIBBPF_ERRNO__FORMAT; + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + n = sh.sh_size / sh.sh_entsize; pr_debug("looking for externs among %d symbols...\n", n); @@ -3168,6 +3311,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) sec_name = btf__name_by_offset(obj->btf, sec->name_off); if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } kcfg_sec = sec; ext->type = EXT_KCFG; ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); @@ -3189,6 +3337,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + if (btf_is_func(t) && ext->is_weak) { + pr_warn("extern weak function %s is unsupported\n", + ext->name); + return -ENOTSUP; + } ksym_sec = sec; ext->type = EXT_KSYM; skip_mods_and_typedefs(obj->btf, t->type, @@ -3215,7 +3368,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj) * extern variables in DATASEC */ int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to 
refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; if (ext->type != EXT_KSYM) @@ -3234,12 +3394,32 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext_name = btf__name_by_offset(obj->btf, vt->name_off); ext = find_extern_by_name(obj, ext_name); if (!ext) { - pr_warn("failed to find extern definition for BTF var '%s'\n", - ext_name); + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); return -ESRCH; } - btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; - vt->type = int_btf_id; + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. + */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } vs->offset = off; vs->size = sizeof(int); } @@ -3369,33 +3549,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; - reloc_desc->processed = false; - - /* sub-program call relocation */ - if (insn->code == (BPF_JMP | BPF_CALL)) { - if (insn->src_reg != BPF_PSEUDO_CALL) { - pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); - return -LIBBPF_ERRNO__RELOC; - } - /* text_shndx can be 0, if no default "main" program exists */ - if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { - sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); - pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", - prog->name, sym_name, 
sym_sec_name); - return -LIBBPF_ERRNO__RELOC; - } - if (sym->st_value % BPF_INSN_SZ) { - pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", - prog->name, sym_name, (size_t)sym->st_value); - return -LIBBPF_ERRNO__RELOC; - } - reloc_desc->type = RELO_CALL; - reloc_desc->insn_idx = insn_idx; - reloc_desc->sym_off = sym->st_value; - return 0; - } - - if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -3418,18 +3572,62 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); - reloc_desc->type = RELO_EXTERN; + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_FUNC; + else + reloc_desc->type = RELO_EXTERN_VAR; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = i; /* sym_off stores extern index */ return 0; } + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { pr_warn("prog 
'%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", prog->name, sym_name, shdr_idx); return -LIBBPF_ERRNO__RELOC; } + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. + */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); @@ -3533,11 +3731,16 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data int err, i, nrels; const char *sym_name; __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; GElf_Sym sym; GElf_Rel rel; + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + relo_sec_name = elf_sec_str(obj, shdr->sh_name); - sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + sec_name = elf_sec_name(obj, scn); if (!relo_sec_name || !sec_name) return -EINVAL; @@ -3555,7 +3758,8 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; } - if (rel.r_offset % BPF_INSN_SZ) { + + if (rel.r_offset % BPF_INSN_SZ || rel.r_offset >= scn_data->d_size) { pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; @@ -3579,9 +3783,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { 
- pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", relo_sec_name, i, sec_name, insn_idx); - return -LIBBPF_ERRNO__RELOC; + continue; } relos = libbpf_reallocarray(prog->reloc_desc, @@ -3696,6 +3900,14 @@ __u32 bpf_map__max_entries(const struct bpf_map *map) return map->def.max_entries; } +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return NULL; + + return map->inner_map; +} + int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { if (map->fd >= 0) @@ -3882,6 +4094,18 @@ static int probe_kern_btf_datasec(void) strs, sizeof(strs))); } +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + static int probe_kern_array_mmap(void) { struct bpf_create_map_attr attr = { @@ -4061,6 +4285,9 @@ static struct kern_feature_desc { [FEAT_MODULE_BTF] = { "module BTF support", probe_module_btf, }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) @@ -4795,8 +5022,8 @@ static int load_module_btfs(struct bpf_object *obj) goto err_out; } - err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, - sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) goto err_out; @@ -4888,6 +5115,7 @@ err_out: * least one of enums should be anonymous; * - for ENUMs, check sizes, names are ignored; * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; * - for ARRAY, dimensionality is ignored, 
element types are checked for * compatibility recursively; * - everything else shouldn't be ever a target of relocation. @@ -4914,6 +5142,7 @@ recur: switch (btf_kind(local_type)) { case BTF_KIND_PTR: + case BTF_KIND_FLOAT: return 1; case BTF_KIND_FWD: case BTF_KIND_ENUM: { @@ -5566,11 +5795,6 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ } -static bool is_ldimm64(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static int insn_bpf_size_to_bytes(struct bpf_insn *insn) { switch (BPF_SIZE(insn->code)) { @@ -5636,7 +5860,7 @@ poison: /* poison second part of ldimm64 to avoid confusing error from * verifier about "unknown opcode 00" */ - if (is_ldimm64(insn)) + if (is_ldimm64_insn(insn)) bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); return 0; @@ -5712,7 +5936,7 @@ poison: case BPF_LD: { __u64 imm; - if (!is_ldimm64(insn) || + if (!is_ldimm64_insn(insn) || insn[0].src_reg != 0 || insn[0].off != 0 || insn_idx + 1 >= prog->insns_cnt || insn[1].code != 0 || insn[1].dst_reg != 0 || @@ -6023,8 +6247,8 @@ patch_insn: /* bpf_core_patch_insn() should know how to handle missing targ_spec */ err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); if (err) { - pr_warn("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n", - prog->name, relo_idx, relo->insn_off, err); + pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", + prog->name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); return -EINVAL; } @@ -6146,15 +6370,13 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_LD64: insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; - 
relo->processed = true; break; - case RELO_EXTERN: + case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; if (ext->type == EXT_KCFG) { insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; @@ -6170,7 +6392,15 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[1].imm = ext->ksym.addr >> 32; } } - relo->processed = true; + break; + case RELO_EXTERN_FUNC: + ext = &obj->externs[relo->sym_off]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + insn[0].imm = ext->ksym.kernel_btf_id; + break; + case RELO_SUBPROG_ADDR: + insn[0].src_reg = BPF_PSEUDO_FUNC; + /* will be handled as a follow up pass */ break; case RELO_CALL: /* will be handled as a follow up pass */ @@ -6358,11 +6588,11 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; - if (!insn_is_subprog_call(insn)) + if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn)) continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type != RELO_CALL) { + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", prog->name, insn_idx, relo->type); return -LIBBPF_ERRNO__RELOC; @@ -6374,8 +6604,22 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * call always has imm = -1, but for static functions * relocation is against STT_SECTION and insn->imm * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. 
+ */ + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. */ - sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; } else { /* if subprogram call is to a static function within * the same ELF section, there won't be any relocation @@ -6438,9 +6682,6 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * different main programs */ insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; - if (relo) - relo->processed = true; - pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); } @@ -6533,7 +6774,7 @@ static int bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_program *subprog; - int i, j, err; + int i, err; /* mark all subprogs as not relocated (yet) within the context of * current main program @@ -6544,9 +6785,6 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) continue; subprog->sub_insn_off = 0; - for (j = 0; j < subprog->nr_reloc; j++) - if (subprog->reloc_desc[j].type == RELO_CALL) - subprog->reloc_desc[j].processed = false; } err = bpf_object__reloc_code(obj, prog, prog); @@ -6793,7 +7031,7 @@ static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id return false; } -static int bpf_object__sanitize_prog(struct bpf_object* obj, struct bpf_program *prog) +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_insn *insn = prog->insns; enum bpf_func_id func_id; @@ 
-7274,6 +7512,7 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) { char sym_type, sym_name[500]; unsigned long long sym_addr; + const struct btf_type *t; struct extern_desc *ext; int ret, err = 0; FILE *f; @@ -7300,6 +7539,10 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) if (!ext || ext->type != EXT_KSYM) continue; + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + continue; + if (ext->is_set && ext->ksym.addr != sym_addr) { pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", sym_name, ext->ksym.addr, sym_addr); @@ -7318,75 +7561,151 @@ out: return err; } -static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + int *res_btf_fd) { - struct extern_desc *ext; + int i, id, btf_fd, err; struct btf *btf; - int i, j, id, btf_fd, err; - for (i = 0; i < obj->nr_extern; i++) { - const struct btf_type *targ_var, *targ_type; - __u32 targ_type_id, local_type_id; - const char *targ_var_name; - int ret; - - ext = &obj->externs[i]; - if (ext->type != EXT_KSYM || !ext->ksym.type_id) - continue; + btf = obj->btf_vmlinux; + btf_fd = 0; + id = btf__find_by_name_kind(btf, ksym_name, kind); - btf = obj->btf_vmlinux; - btf_fd = 0; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id == -ENOENT) { - err = load_module_btfs(obj); - if (err) - return err; + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; - for (j = 0; j < obj->btf_module_cnt; j++) { - btf = obj->btf_modules[j].btf; - /* we assume module BTF FD is always >0 */ - btf_fd = obj->btf_modules[j].fd; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id != -ENOENT) - break; - } - } - if (id <= 0) { - pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", - ext->name); - return -ESRCH; + for (i = 0; i < obj->btf_module_cnt; i++) { + btf = 
obj->btf_modules[i].btf; + /* we assume module BTF FD is always >0 */ + btf_fd = obj->btf_modules[i].fd; + id = btf__find_by_name_kind(btf, ksym_name, kind); + if (id != -ENOENT) + break; } + } + if (id <= 0) { + pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", + __btf_kind_str(kind), ksym_name); + return -ESRCH; + } + + *res_btf = btf; + *res_btf_fd = btf_fd; + return id; +} - /* find local type_id */ - local_type_id = ext->ksym.type_id; +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int id, btf_fd = 0, err; + struct btf *btf = NULL; - /* find target type_id */ - targ_var = btf__type_by_id(btf, id); - targ_var_name = btf__name_by_offset(btf, targ_var->name_off); - targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd); + if (id < 0) + return id; - ret = bpf_core_types_are_compat(obj->btf, local_type_id, - btf, targ_type_id); - if (ret <= 0) { - const struct btf_type *local_type; - const char *targ_name, *local_name; + /* find local type_id */ + local_type_id = ext->ksym.type_id; - local_type = btf__type_by_id(obj->btf, local_type_id); - local_name = btf__name_by_offset(obj->btf, local_type->name_off); - targ_name = btf__name_by_offset(btf, targ_type->name_off); + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); - pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", - ext->name, local_type_id, - btf_kind_str(local_type), local_name, targ_type_id, - btf_kind_str(targ_type), targ_name); - return -EINVAL; - } + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if 
(err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); + + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = btf_fd; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + + return 0; +} + +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret, kern_btf_fd = 0; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, + &kern_btf, &kern_btf_fd); + if (kfunc_id < 0) { + pr_warn("extern (func ksym) '%s': not found in kernel BTF\n", + ext->name); + return kfunc_id; + } + + if (kern_btf != obj->btf_vmlinux) { + pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n", + ext->name); + return -ENOTSUP; + } - ext->is_set = true; - ext->ksym.kernel_btf_obj_fd = btf_fd; - ext->ksym.kernel_btf_id = id; - pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", - ext->name, id, btf_kind_str(targ_var), targ_var_name); + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", + ext->name, local_func_proto_id, kfunc_proto_id); + return -EINVAL; + 
} + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = kern_btf_fd; + ext->ksym.kernel_btf_id = kfunc_id; + pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", + ext->name, kfunc_id); + + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); + if (err) + return err; } return 0; } @@ -8193,6 +8512,16 @@ int bpf_object__btf_fd(const struct bpf_object *obj) return obj->btf ? btf__fd(obj->btf) : -1; } +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return -EINVAL; + + obj->kern_version = kern_version; + + return 0; +} + int bpf_object__set_priv(struct bpf_object *obj, void *priv, bpf_object_clear_priv_t clear_priv) { @@ -8381,7 +8710,7 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n) return fd; } -enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog) { return prog->type; } @@ -8426,7 +8755,7 @@ BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog) +bpf_program__get_expected_attach_type(const struct bpf_program *prog) { return prog->expected_attach_type; } @@ -9202,6 +9531,7 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) pr_warn("error: inner_map_fd already specified\n"); return -EINVAL; } + zfree(&map->inner_map); map->inner_map_fd = fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3c35eb401931..bec4e6a6e31d 
100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -143,6 +143,7 @@ LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); @@ -361,12 +362,12 @@ LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); -LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); LIBBPF_API enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog); +bpf_program__get_expected_attach_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type); @@ -479,6 +480,7 @@ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); LIBBPF_API long libbpf_get_error(const void *ptr); @@ -507,6 +509,7 @@ struct xdp_link_info { struct bpf_xdp_set_link_opts { size_t sz; int old_fd; + size_t :0; }; #define bpf_xdp_set_link_opts__last_field old_fd @@ -759,6 +762,19 @@ enum libbpf_tristate { TRI_MODULE = 2, }; +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker
*bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filename); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 1c0fd2dd233a..b9b29baf1df8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -350,3 +350,15 @@ LIBBPF_0.3.0 { xsk_setup_xdp_prog; xsk_socket__update_xskmap; } LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; + btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; + bpf_map__inner_map; + bpf_object__set_kversion; +} LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 969d0ac592ba..ee426226928f 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -19,6 +19,27 @@ #pragma GCC poison reallocarray #include "libbpf.h" +#include "btf.h" + +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) @@ -31,6 +52,8 @@ #define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) #define BTF_PARAM_ENC(name, type) (name), (type) #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) @@ -105,9 +128,58 @@ static inline void 
*libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) return realloc(ptr, total); } -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt); -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + + MAP_DEF_ALL = 0x3ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); + +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); static inline bool libbpf_validate_opts(const char *opts, size_t opts_sz, size_t user_sz, @@ -349,4 +421,11 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +typedef int (*type_id_visit_fn)(__u32 *type_id, 
void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h deleted file mode 100644 index 59c779c5790c..000000000000 --- a/tools/lib/bpf/libbpf_util.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* Copyright (c) 2019 Facebook */ - -#ifndef __LIBBPF_LIBBPF_UTIL_H -#define __LIBBPF_LIBBPF_UTIL_H - -#include <stdbool.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* Use these barrier functions instead of smp_[rw]mb() when they are - * used in a libbpf header file. That way they can be built into the - * application that uses libbpf. - */ -#if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_rmb() asm volatile("" : : : "memory") -# define libbpf_smp_wmb() asm volatile("" : : : "memory") -# define libbpf_smp_mb() \ - asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc") -/* Hinders stores to be observed before older loads. 
*/ -# define libbpf_smp_rwmb() asm volatile("" : : : "memory") -#elif defined(__aarch64__) -# define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#elif defined(__arm__) -/* These are only valid for armv7 and above */ -# define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#else -/* Architecture missing native barrier functions. */ -# define libbpf_smp_rmb() __sync_synchronize() -# define libbpf_smp_wmb() __sync_synchronize() -# define libbpf_smp_mb() __sync_synchronize() -# define libbpf_smp_rwmb() __sync_synchronize() -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c new file mode 100644 index 000000000000..9de084b1c699 --- /dev/null +++ b/tools/lib/bpf/linker.c @@ -0,0 +1,2883 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <linux/err.h> +#include <linux/btf.h> +#include <elf.h> +#include <libelf.h> +#include <gelf.h> +#include <fcntl.h> +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +#define BTF_EXTERN_SEC ".extern" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is 
omitted from the final ELF file */ + bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. */ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; +}; + +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; 
+ + /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; +}; + +#define pr_warn_elf(fmt, ...) \ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker 
*linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); + } + free(linker->secs); + + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? 
linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr) { + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER == __LITTLE_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; 
+#else +#error "Unknown __BYTE_ORDER" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* 
sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename) +{ + struct src_obj obj = {}; + int err = 0; + + if (!linker->elf) + return -EINVAL; + + err = err ?: linker_load_obj_file(linker, filename, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); + + /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return err; +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + 
strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj) +{ +#if __BYTE_ORDER == __LITTLE_ENDIAN + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to 
get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, 
err); + return err; + } + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); + + return err; +} + +static bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec; + int i, err; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; + break; + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case 
SHT_NOBITS: + break; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); + return -EINVAL; + } + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, 
sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); + + return -EINVAL; + } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } + + return 0; +} + +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel<secname> -> <secname> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions 
*/ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + + return 0; +} + +static int check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id > btf__get_nr_types(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) 
+ return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; + + /* ephemeral sections are just thin section shells lacking most parts */ + if (src_sec->ephemeral) + return 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -ENOMEM; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + 
sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->ephemeral || src->ephemeral) + return true; + + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align, src_align; + size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. 
+ */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + if (!tmp) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped || sec->ephemeral) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); 
+ if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, "license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; + int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. 
+ */ + if (i == 0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward 
declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; + } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? "union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* 
ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } 
+ + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + 
} + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } + + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid 
dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and 
VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) + continue; + + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) + continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + return -EINVAL; + } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; + } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } + + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} + +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. 
/*
 * Replace the visibility of ELF symbol @sym with @sym_vis, leaving all other
 * bits of st_other untouched.
 *
 * @sym_vis is expected to be one of the STV_* values, i.e. to fit into the
 * two low bits of st_other.
 */
static void sym_update_visibility(Elf64_Sym *sym, int sym_vis)
{
	/* libelf doesn't provide setters for ST_VISIBILITY,
	 * but it is stored in the lower 2 bits of st_other
	 */
	/* clear the old visibility first, keeping the upper bits intact */
	sym->st_other &= ~0x03;
	sym->st_other |= sym_vis;
}
struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. 
+ */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. + */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } + + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); + + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; + } + + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. 
+ */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. .extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? 
dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + struct dst_sec *dst_symtab = &linker->secs[linker->symtab_sec_idx]; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("sections %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); + size_t sym_type = ELF64_R_TYPE(src_rel->r_info); + Elf64_Sym *src_sym, *dst_sym; + size_t 
dst_sym_idx; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec 
*sec; + int i, j, n, m; + + if (!obj->btf) + return 0; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. 
+ */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. 
*/ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + const char *name; + + if (!obj->btf) + return 0; + + start_id = btf__get_nr_types(linker->btf) + 1; + n = btf__get_nr_types(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i <= n; i++) { + struct glob_sym *glob_sym = NULL; + + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = 
-t->type; + } + } + + /* remap all the types except DATASECs */ + n = btf__get_nr_types(linker->btf); + for (i = start_id; i <= n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. 
+ */ + dst_sec->has_btf = true; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. 
+ */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section 
'%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine */ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = 
&linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return -EINVAL; + + err = finalize_btf(linker); + if (err) + return err; + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return err; + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return 
err; + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + struct btf *btf = linker->btf; + const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__get_nr_types(linker->btf) == 0) + return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->has_btf) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + err = btf__dedup(linker->btf, linker->btf_ext, NULL); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + 
raw_data = btf__get_raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + 
funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = 
emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + goto out; + } + +out: + free(data); + return err; +} diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..d2cb28e9ef52 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -40,7 +40,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (sock < 0) return -errno; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8caaafe7e312..e7a8d847161f 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -227,7 +227,7 @@ static int ringbuf_process_ring(struct ring* r) if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; err = r->sample_cb(r->ctx, sample, len); - if (err) { + if (err < 0) { /* update consumer pos and bail out */ smp_store_release(r->consumer_pos, cons_pos); diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c new file mode 100644 index 
000000000000..1fb8b49de1d6 --- /dev/null +++ b/tools/lib/bpf/strset.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <linux/err.h> +#include "hashmap.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(const void *key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + (long)key; + + return str_hash(str); +} + +static bool strset_equal_fn(const void *key1, const void *key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + (long)key1; + const char *str2 = s->strs_data + (long)key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, (void *)off, (void *)off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); 
+ return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. + */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, (void *)new_off, (void **)&old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. 
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */

/* Copyright (c) 2021 Facebook */
#ifndef __LIBBPF_STRSET_H
#define __LIBBPF_STRSET_H

#include <stdbool.h>
#include <stddef.h>

/* Opaque set of unique NUL-terminated strings stored back to back in one
 * contiguous buffer; strings are identified by their byte offset within that
 * buffer.
 */
struct strset;

/* Create a string set.
 * max_data_sz is the upper bound passed along when the string buffer has to
 * grow. If init_data is non-NULL, init_data_sz bytes are copied from it and
 * every NUL-terminated string found in the copy is indexed (duplicates are
 * tolerated).
 * Returns the new set, or an ERR_PTR-encoded negative error on failure.
 */
struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz);
/* Release resources held by a set; NULL and ERR_PTR values are ignored. */
void strset__free(struct strset *set);

/* Pointer to the start of the set's string buffer. */
const char *strset__data(const struct strset *set);
/* Number of bytes of the string buffer currently in use. */
size_t strset__data_size(const struct strset *set);

/* Look up string s.
 * Returns its offset in the string buffer if present, -ENOENT if it is not
 * in the set, or -ENOMEM if scratch space for the lookup can't be reserved.
 * The set is non-const because the lookup temporarily stages s past the end
 * of the used part of the buffer.
 */
int strset__find_str(struct strset *set, const char *s);
/* Add string s unless an identical string is already present.
 * Returns the offset of s (the existing one for duplicates) on success, or a
 * negative error code on failure.
 */
int strset__add_str(struct strset *set, const char *s);

#endif /* __LIBBPF_STRSET_H */
list; int prog_fd; + int link_fd; int xsks_map_fd; char ifname[IFNAMSIZ]; + bool has_bpf_link; }; struct xsk_socket { @@ -409,7 +414,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) static const int log_buf_size = 16 * 1024; struct xsk_ctx *ctx = xsk->ctx; char log_buf[log_buf_size]; - int err, prog_fd; + int prog_fd; /* This is the fallback C-program: * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) @@ -499,14 +504,41 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) return prog_fd; } - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd, - xsk->config.xdp_flags); + ctx->prog_fd = prog_fd; + return 0; +} + +static int xsk_create_bpf_link(struct xsk_socket *xsk) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int link_fd; + int err; + + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); if (err) { - close(prog_fd); + pr_warn("getting XDP prog id failed\n"); return err; } - ctx->prog_fd = prog_fd; + /* if there's a netlink-based XDP prog loaded on interface, bail out + * and ask user to do the removal by himself + */ + if (prog_id) { + pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); + return -EINVAL; + } + + opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); + + link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); + if (link_fd < 0) { + pr_warn("bpf_link_create failed: %s\n", strerror(errno)); + return link_fd; + } + + ctx->link_fd = link_fd; return 0; } @@ -625,7 +657,6 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) close(fd); } - err = 0; if (ctx->xsks_map_fd == -1) err = -ENOENT; @@ -642,6 +673,98 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk) &xsk->fd, 0); } +static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) +{ + struct bpf_link_info link_info; + __u32 link_len; + __u32 id = 0; + int err; + int fd; + + while (true) 
{ + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + pr_warn("can't get next link: %s\n", strerror(errno)); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); + err = -errno; + break; + } + + link_len = sizeof(struct bpf_link_info); + memset(&link_info, 0, link_len); + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); + if (err) { + pr_warn("can't get link info: %s\n", strerror(errno)); + close(fd); + break; + } + if (link_info.type == BPF_LINK_TYPE_XDP) { + if (link_info.xdp.ifindex == ifindex) { + *link_fd = fd; + if (prog_id) + *prog_id = link_info.prog_id; + break; + } + } + close(fd); + } + + return err; +} + +static bool xsk_probe_bpf_link(void) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .flags = XDP_FLAGS_SKB_MODE); + struct bpf_load_program_attr prog_attr; + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), + BPF_EXIT_INSN() + }; + int prog_fd, link_fd = -1; + int ifindex_lo = 1; + bool ret = false; + int err; + + err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); + if (err) + return ret; + + if (link_fd >= 0) + return true; + + memset(&prog_attr, 0, sizeof(prog_attr)); + prog_attr.prog_type = BPF_PROG_TYPE_XDP; + prog_attr.insns = insns; + prog_attr.insns_cnt = ARRAY_SIZE(insns); + prog_attr.license = "GPL"; + + prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0); + if (prog_fd < 0) + return ret; + + link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); + close(prog_fd); + + if (link_fd >= 0) { + ret = true; + close(link_fd); + } + + return ret; +} + static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) { char ifname[IFNAMSIZ]; @@ -663,64 +786,108 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) ctx->ifname[IFNAMSIZ - 1] = 0; xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); return 0; } 
-static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, - int *xsks_map_fd) +static int xsk_init_xdp_res(struct xsk_socket *xsk, + int *xsks_map_fd) { - struct xsk_socket *xsk = _xdp; struct xsk_ctx *ctx = xsk->ctx; - __u32 prog_id = 0; int err; - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, - xsk->config.xdp_flags); + err = xsk_create_bpf_maps(xsk); if (err) return err; - if (!prog_id) { - err = xsk_create_bpf_maps(xsk); - if (err) - return err; + err = xsk_load_xdp_prog(xsk); + if (err) + goto err_load_xdp_prog; - err = xsk_load_xdp_prog(xsk); - if (err) { - goto err_load_xdp_prog; - } - } else { - ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (ctx->prog_fd < 0) - return -errno; - err = xsk_lookup_bpf_maps(xsk); - if (err) { - close(ctx->prog_fd); - return err; - } - } + if (ctx->has_bpf_link) + err = xsk_create_bpf_link(xsk); + else + err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, + xsk->config.xdp_flags); - if (xsk->rx) { - err = xsk_set_bpf_maps(xsk); - if (err) { - if (!prog_id) { - goto err_set_bpf_maps; - } else { - close(ctx->prog_fd); - return err; - } - } - } - if (xsks_map_fd) - *xsks_map_fd = ctx->xsks_map_fd; + if (err) + goto err_attach_xdp_prog; - return 0; + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_bpf_maps; + + return err; err_set_bpf_maps: + if (ctx->has_bpf_link) + close(ctx->link_fd); + else + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); +err_attach_xdp_prog: close(ctx->prog_fd); - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); err_load_xdp_prog: xsk_delete_bpf_maps(xsk); + return err; +} + +static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) +{ + struct xsk_ctx *ctx = xsk->ctx; + int err; + + ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (ctx->prog_fd < 0) { + err = -errno; + goto err_prog_fd; + } + err = xsk_lookup_bpf_maps(xsk); + if (err) + goto err_lookup_maps; + + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + 
goto err_set_maps; + + return err; + +err_set_maps: + close(ctx->xsks_map_fd); +err_lookup_maps: + close(ctx->prog_fd); +err_prog_fd: + if (ctx->has_bpf_link) + close(ctx->link_fd); + return err; +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) +{ + struct xsk_socket *xsk = _xdp; + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int err; + + if (ctx->has_bpf_link) + err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); + else + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); + + if (err) + return err; + + err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) : + xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); + + if (!err && xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; return err; } @@ -743,26 +910,30 @@ static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, return NULL; } -static void xsk_put_ctx(struct xsk_ctx *ctx) +static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) { struct xsk_umem *umem = ctx->umem; struct xdp_mmap_offsets off; int err; - if (--ctx->refcount == 0) { - err = xsk_get_mmap_offsets(umem->fd, &off); - if (!err) { - munmap(ctx->fill->ring - off.fr.desc, - off.fr.desc + umem->config.fill_size * - sizeof(__u64)); - munmap(ctx->comp->ring - off.cr.desc, - off.cr.desc + umem->config.comp_size * - sizeof(__u64)); - } + if (--ctx->refcount) + return; - list_del(&ctx->list); - free(ctx); - } + if (!unmap) + goto out_free; + + err = xsk_get_mmap_offsets(umem->fd, &off); + if (err) + goto out_free; + + munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * + sizeof(__u64)); + munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * + sizeof(__u64)); + +out_free: + list_del(&ctx->list); + free(ctx); } static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, @@ -797,8 +968,6 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, memcpy(ctx->ifname, ifname, IFNAMSIZ - 1); ctx->ifname[IFNAMSIZ - 1] = '\0'; - 
umem->fill_save = NULL; - umem->comp_save = NULL; ctx->fill = fill; ctx->comp = comp; list_add(&ctx->list, &umem->ctx_list); @@ -848,6 +1017,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, struct xsk_ring_cons *comp, const struct xsk_socket_config *usr_config) { + bool unmap, rx_setup_done = false, tx_setup_done = false; void *rx_map = NULL, *tx_map = NULL; struct sockaddr_xdp sxdp = {}; struct xdp_mmap_offsets off; @@ -858,6 +1028,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, if (!umem || !xsk_ptr || !(rx || tx)) return -EFAULT; + unmap = umem->fill_save != fill; + xsk = calloc(1, sizeof(*xsk)); if (!xsk) return -ENOMEM; @@ -881,6 +1053,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } else { xsk->fd = umem->fd; + rx_setup_done = umem->rx_ring_setup_done; + tx_setup_done = umem->tx_ring_setup_done; } ctx = xsk_get_ctx(umem, ifindex, queue_id); @@ -898,8 +1072,9 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); - if (rx) { + if (rx && !rx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, &xsk->config.rx_size, sizeof(xsk->config.rx_size)); @@ -907,8 +1082,10 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, err = -errno; goto out_put_ctx; } + if (xsk->fd == umem->fd) + umem->rx_ring_setup_done = true; } - if (tx) { + if (tx && !tx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, &xsk->config.tx_size, sizeof(xsk->config.tx_size)); @@ -916,6 +1093,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, err = -errno; goto out_put_ctx; } + if (xsk->fd == umem->fd) + umem->rx_ring_setup_done = true; } err = xsk_get_mmap_offsets(xsk->fd, &off); @@ -994,6 +1173,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } *xsk_ptr = xsk; + umem->fill_save = NULL; + umem->comp_save = NULL; return 0; out_mmap_tx: @@ -1005,7 +1186,7 @@ out_mmap_rx: munmap(rx_map, off.rx.desc + xsk->config.rx_size 
* sizeof(struct xdp_desc)); out_put_ctx: - xsk_put_ctx(ctx); + xsk_put_ctx(ctx, unmap); out_socket: if (--umem->refcount) close(xsk->fd); @@ -1019,6 +1200,9 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *usr_config) { + if (!umem) + return -EFAULT; + return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, rx, tx, umem->fill_save, umem->comp_save, usr_config); @@ -1054,6 +1238,8 @@ void xsk_socket__delete(struct xsk_socket *xsk) if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); + if (ctx->has_bpf_link) + close(ctx->link_fd); } err = xsk_get_mmap_offsets(xsk->fd, &off); @@ -1068,7 +1254,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) } } - xsk_put_ctx(ctx); + xsk_put_ctx(ctx, true); umem->refcount--; /* Do not close an fd that also has an associated umem connected diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index e9f121f5d129..01c12dca9c10 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -3,7 +3,8 @@ /* * AF_XDP user-space access library. * - * Copyright(c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2019 Facebook * * Author(s): Magnus Karlsson <magnus.karlsson@intel.com> */ @@ -13,15 +14,80 @@ #include <stdio.h> #include <stdint.h> +#include <stdbool.h> #include <linux/if_xdp.h> #include "libbpf.h" -#include "libbpf_util.h" #ifdef __cplusplus extern "C" { #endif +/* Load-Acquire Store-Release barriers used by the XDP socket + * library. The following macros should *NOT* be considered part of + * the xsk.h API, and is subject to change anytime. 
+ * + * LIBRARY INTERNAL + */ + +#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#if defined(__i386__) || defined(__x86_64__) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) +#elif defined(__aarch64__) +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + ___p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) +#endif + +/* LIBRARY INTERNAL -- END */ + /* Do not access these members directly. Use the functions below. */ #define DEFINE_XSK_RING(name) \ struct name { \ @@ -96,7 +162,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) * this function. Without this optimization it whould have been * free_entries = r->cached_prod - r->cached_cons + r->size. 
*/ - r->cached_cons = *r->consumer + r->size; + r->cached_cons = libbpf_smp_load_acquire(r->consumer); + r->cached_cons += r->size; return r->cached_cons - r->cached_prod; } @@ -106,7 +173,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) __u32 entries = r->cached_prod - r->cached_cons; if (entries == 0) { - r->cached_prod = *r->producer; + r->cached_prod = libbpf_smp_load_acquire(r->producer); entries = r->cached_prod - r->cached_cons; } @@ -129,9 +196,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. */ - libbpf_smp_wmb(); - - *prod->producer += nb; + libbpf_smp_store_release(prod->producer, *prod->producer + nb); } static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) @@ -139,11 +204,6 @@ static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __ __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { - /* Make sure we do not speculatively read the data before - * we have received the packet buffers from the ring. - */ - libbpf_smp_rmb(); - *idx = cons->cached_cons; cons->cached_cons += entries; } @@ -161,9 +221,8 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. 
*/ - libbpf_smp_rwmb(); + libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); - *cons->consumer += nb; } static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) diff --git a/tools/memory-model/Documentation/access-marking.txt b/tools/memory-model/Documentation/access-marking.txt new file mode 100644 index 000000000000..1ab189f51f55 --- /dev/null +++ b/tools/memory-model/Documentation/access-marking.txt @@ -0,0 +1,479 @@ +MARKING SHARED-MEMORY ACCESSES +============================== + +This document provides guidelines for marking intentionally concurrent +normal accesses to shared memory, that is "normal" as in accesses that do +not use read-modify-write atomic operations. It also describes how to +document these accesses, both with comments and with special assertions +processed by the Kernel Concurrency Sanitizer (KCSAN). This discussion +builds on an earlier LWN article [1]. + + +ACCESS-MARKING OPTIONS +====================== + +The Linux kernel provides the following access-marking options: + +1. Plain C-language accesses (unmarked), for example, "a = b;" + +2. Data-race marking, for example, "data_race(a = b);" + +3. READ_ONCE(), for example, "a = READ_ONCE(b);" + The various forms of atomic_read() also fit in here. + +4. WRITE_ONCE(), for example, "WRITE_ONCE(a, b);" + The various forms of atomic_set() also fit in here. + + +These may be used in combination, as shown in this admittedly improbable +example: + + WRITE_ONCE(a, b + data_race(c + d) + READ_ONCE(e)); + +Neither plain C-language accesses nor data_race() (#1 and #2 above) place +any sort of constraint on the compiler's choice of optimizations [2]. +In contrast, READ_ONCE() and WRITE_ONCE() (#3 and #4 above) restrict the +compiler's use of code-motion and common-subexpression optimizations. 
+Therefore, if a given access is involved in an intentional data race, +using READ_ONCE() for loads and WRITE_ONCE() for stores is usually +preferable to data_race(), which in turn is usually preferable to plain +C-language accesses. + +KCSAN will complain about many types of data races involving plain +C-language accesses, but marking all accesses involved in a given data +race with one of data_race(), READ_ONCE(), or WRITE_ONCE(), will prevent +KCSAN from complaining. Of course, lack of KCSAN complaints does not +imply correct code. Therefore, please take a thoughtful approach +when responding to KCSAN complaints. Churning the code base with +ill-considered additions of data_race(), READ_ONCE(), and WRITE_ONCE() +is unhelpful. + +In fact, the following sections describe situations where use of +data_race() and even plain C-language accesses is preferable to +READ_ONCE() and WRITE_ONCE(). + + +Use of the data_race() Macro +---------------------------- + +Here are some situations where data_race() should be used instead of +READ_ONCE() and WRITE_ONCE(): + +1. Data-racy loads from shared variables whose values are used only + for diagnostic purposes. + +2. Data-racy reads whose values are checked against marked reload. + +3. Reads whose values feed into error-tolerant heuristics. + +4. Writes setting values that feed into error-tolerant heuristics. + + +Data-Racy Reads for Approximate Diagnostics + +Approximate diagnostics include lockdep reports, monitoring/statistics +(including /proc and /sys output), WARN*()/BUG*() checks whose return +values are ignored, and other situations where reads from shared variables +are not an integral part of the core concurrency design. + +In fact, use of data_race() instead READ_ONCE() for these diagnostic +reads can enable better checking of the remaining accesses implementing +the core concurrency design. 
For example, suppose that the core design +prevents any non-diagnostic reads from shared variable x from running +concurrently with updates to x. Then using plain C-language writes +to x allows KCSAN to detect reads from x from within regions of code +that fail to exclude the updates. In this case, it is important to use +data_race() for the diagnostic reads because otherwise KCSAN would give +false-positive warnings about these diagnostic reads. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Data-Racy Reads That Are Checked Against Marked Reload + +The values from some reads are not implicitly trusted. They are instead +fed into some operation that checks the full value against a later marked +load from memory, which means that the occasional arbitrarily bogus value +is not a problem. For example, if a bogus value is fed into cmpxchg(), +all that happens is that this cmpxchg() fails, which normally results +in a retry. Unless the race condition that resulted in the bogus value +recurs, this retry will with high probability succeed, so no harm done. + +However, please keep in mind that a data_race() load feeding into +a cmpxchg_relaxed() might still be subject to load fusing on some +architectures. Therefore, it is best to capture the return value from +the failing cmpxchg() for the next iteration of the loop, an approach +that provides the compiler much less scope for mischievous optimizations. +Capturing the return value from cmpxchg() also saves a memory reference +in many cases. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. 
+ + +Reads Feeding Into Error-Tolerant Heuristics + +Values from some reads feed into heuristics that can tolerate occasional +errors. Such reads can use data_race(), thus allowing KCSAN to focus on +the other accesses to the relevant shared variables. But please note +that data_race() loads are subject to load fusing, which can result in +consistent errors, which in turn are quite capable of breaking heuristics. +Therefore use of data_race() should be limited to cases where some other +code (such as a barrier() call) will force the occasional reload. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Writes Setting Values Feeding Into Error-Tolerant Heuristics + +The values read into error-tolerant heuristics come from somewhere, +for example, from sysfs. This means that some code in sysfs writes +to this same variable, and these writes can also use data_race(). +After all, if the heuristic can tolerate the occasional bogus value +due to compiler-mangled reads, it can also tolerate the occasional +compiler-mangled write, at least assuming that the proper value is in +place once the write completes. + +Plain C-language stores can also be used for this use case. However, +in kernels built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, this +will have the disadvantage of causing KCSAN to generate false positives +because KCSAN will have no way of knowing that the resulting data race +was intentional. + + +Use of Plain C-Language Accesses +-------------------------------- + +Here are some example situations where plain C-language accesses should +used instead of READ_ONCE(), WRITE_ONCE(), and data_race(): + +1. Accesses protected by mutual exclusion, including strict locking + and sequence locking. + +2. Initialization-time and cleanup-time accesses. 
This covers a + wide variety of situations, including the uniprocessor phase of + system boot, variables to be used by not-yet-spawned kthreads, + structures not yet published to reference-counted or RCU-protected + data structures, and the cleanup side of any of these situations. + +3. Per-CPU variables that are not accessed from other CPUs. + +4. Private per-task variables, including on-stack variables, some + fields in the task_struct structure, and task-private heap data. + +5. Any other loads for which there is not supposed to be a concurrent + store to that same variable. + +6. Any other stores for which there should be neither concurrent + loads nor concurrent stores to that same variable. + + But note that KCSAN makes two explicit exceptions to this rule + by default, refraining from flagging plain C-language stores: + + a. No matter what. You can override this default by building + with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n. + + b. When the store writes the value already contained in + that variable. You can override this default by building + with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. + + c. When one of the stores is in an interrupt handler and + the other in the interrupted code. You can override this + default by building with CONFIG_KCSAN_INTERRUPT_WATCHER=y. + +Note that it is important to use plain C-language accesses in these cases, +because doing otherwise prevents KCSAN from detecting violations of your +code's synchronization rules. + + +ACCESS-DOCUMENTATION OPTIONS +============================ + +It is important to comment marked accesses so that people reading your +code, yourself included, are reminded of the synchronization design. +However, it is even more important to comment plain C-language accesses +that are intentionally involved in data races. Such comments are +needed to remind people reading your code, again, yourself included, +of how the compiler has been prevented from optimizing those accesses +into concurrency bugs. 
+ +It is also possible to tell KCSAN about your synchronization design. +For example, ASSERT_EXCLUSIVE_ACCESS(foo) tells KCSAN that any +concurrent access to variable foo by any other CPU is an error, even +if that concurrent access is marked with READ_ONCE(). In addition, +ASSERT_EXCLUSIVE_WRITER(foo) tells KCSAN that although it is OK for there +to be concurrent reads from foo from other CPUs, it is an error for some +other CPU to be concurrently writing to foo, even if that concurrent +write is marked with data_race() or WRITE_ONCE(). + +Note that although KCSAN will call out data races involving either +ASSERT_EXCLUSIVE_ACCESS() or ASSERT_EXCLUSIVE_WRITER() on the one hand +and data_race() writes on the other, KCSAN will not report the location +of these data_race() writes. + + +EXAMPLES +======== + +As noted earlier, the goal is to prevent the compiler from destroying +your concurrent algorithm, to help the human reader, and to inform +KCSAN of aspects of your concurrency design. This section looks at a +few examples showing how this can be done. + + +Lock Protection With Lockless Diagnostic Access +----------------------------------------------- + +For example, suppose a shared variable "foo" is read only while a +reader-writer spinlock is read-held, written only while that same +spinlock is write-held, except that it is also read locklessly for +diagnostic purposes. 
The code might look as follows: + + int foo; + DEFINE_RWLOCK(foo_rwlock); + + void update_foo(int newval) + { + write_lock(&foo_rwlock); + foo = newval; + do_something(newval); + write_unlock(&foo_rwlock); + } + + int read_foo(void) + { + int ret; + + read_lock(&foo_rwlock); + do_something_else(); + ret = foo; + read_unlock(&foo_rwlock); + return ret; + } + + int read_foo_diagnostic(void) + { + return data_race(foo); + } + +The reader-writer lock prevents the compiler from introducing concurrency +bugs into any part of the main algorithm using foo, which means that +the accesses to foo within both update_foo() and read_foo() can (and +should) be plain C-language accesses. One benefit of making them be +plain C-language accesses is that KCSAN can detect any erroneous lockless +reads from or updates to foo. The data_race() in read_foo_diagnostic() +tells KCSAN that data races are expected, and should be silently +ignored. This data_race() also tells the human reading the code that +read_foo_diagnostic() might sometimes return a bogus value. + +However, please note that your kernel must be built with +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n in order for KCSAN to +detect a buggy lockless write. If you need KCSAN to detect such a +write even if that write did not change the value of foo, you also +need CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. If you need KCSAN to +detect such a write happening in an interrupt handler running on the +same CPU doing the legitimate lock-protected write, you also need +CONFIG_KCSAN_INTERRUPT_WATCHER=y. With some or all of these Kconfig +options set properly, KCSAN can be quite helpful, although it is not +necessarily a full replacement for hardware watchpoints. On the other +hand, neither are hardware watchpoints a full replacement for KCSAN +because it is not always easy to tell hardware watchpoint to conditionally +trap on accesses. 
+ + +Lock-Protected Writes With Lockless Reads +----------------------------------------- + +For another example, suppose a shared variable "foo" is updated only +while holding a spinlock, but is read locklessly. The code might look +as follows: + + int foo; + DEFINE_SPINLOCK(foo_lock); + + void update_foo(int newval) + { + spin_lock(&foo_lock); + WRITE_ONCE(foo, newval); + ASSERT_EXCLUSIVE_WRITER(foo); + do_something(newval); + spin_unlock(&foo_wlock); + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is read locklessly, all accesses are marked. The purpose +of the ASSERT_EXCLUSIVE_WRITER() is to allow KCSAN to check for a buggy +concurrent lockless write. + + +Lockless Reads and Writes +------------------------- + +For another example, suppose a shared variable "foo" is both read and +updated locklessly. The code might look as follows: + + int foo; + + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is accessed locklessly, all accesses are marked. It does +not make sense to use ASSERT_EXCLUSIVE_WRITER() in this case because +there really can be concurrent lockless writers. KCSAN would +flag any concurrent plain C-language reads from foo, and given +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, also any concurrent plain +C-language writes to foo. + + +Lockless Reads and Writes, But With Single-Threaded Initialization +------------------------------------------------------------------ + +For yet another example, suppose that foo is initialized in a +single-threaded manner, but that a number of kthreads are then created +that locklessly and concurrently access foo. 
Some snippets of this code +might look as follows: + + int foo; + + void initialize_foo(int initval, int nkthreads) + { + int i; + + foo = initval; + ASSERT_EXCLUSIVE_ACCESS(foo); + for (i = 0; i < nkthreads; i++) + kthread_run(access_foo_concurrently, ...); + } + + /* Called from access_foo_concurrently(). */ + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + /* Also called from access_foo_concurrently(). */ + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +The initialize_foo() uses a plain C-language write to foo because there +are not supposed to be concurrent accesses during initialization. The +ASSERT_EXCLUSIVE_ACCESS() allows KCSAN to flag buggy concurrent unmarked +reads, and the ASSERT_EXCLUSIVE_ACCESS() call further allows KCSAN to +flag buggy concurrent writes, even if: (1) Those writes are marked or +(2) The kernel was built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y. + + +Checking Stress-Test Race Coverage +---------------------------------- + +When designing stress tests it is important to ensure that race conditions +of interest really do occur. For example, consider the following code +fragment: + + int foo; + + int update_foo(int newval) + { + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + int read_foo(void) + { + return READ_ONCE(foo); + } + +If it is possible for update_foo(), xor_shift_foo(), and read_foo() to be +invoked concurrently, the stress test should force this concurrency to +actually happen. 
KCSAN can evaluate the stress test when the above code +is modified to read as follows: + + int foo; + + int update_foo(int newval) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + ASSERT_EXCLUSIVE_ACCESS(foo); + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + + int read_foo(void) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return READ_ONCE(foo); + } + +If a given stress-test run does not result in KCSAN complaints from +each possible pair of ASSERT_EXCLUSIVE_ACCESS() invocations, the +stress test needs improvement. If the stress test was to be evaluated +on a regular basis, it would be wise to place the above instances of +ASSERT_EXCLUSIVE_ACCESS() under #ifdef so that they did not result in +false positives when not evaluating the stress test. + + +REFERENCES +========== + +[1] "Concurrency bugs should fear the big bad data-race detector (part 2)" + https://lwn.net/Articles/816854/ + +[2] "Who's afraid of a big bad optimizing compiler?" + https://lwn.net/Articles/793253/ diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index b2da6365be63..6f3d16dbf467 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -19,7 +19,7 @@ Address Dependency: When the address of a later memory access is computed from the value returned by the rcu_dereference() on line 2, the address dependency extends from that rcu_dereference() to that "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.txt + dependencies. Please see Documentation/RCU/rcu_dereference.rst for more information. See also "Control Dependency" and "Data Dependency". 
diff --git a/tools/memory-model/Documentation/simple.txt b/tools/memory-model/Documentation/simple.txt index 81e1a0ec5342..4c789ec8334f 100644 --- a/tools/memory-model/Documentation/simple.txt +++ b/tools/memory-model/Documentation/simple.txt @@ -189,7 +189,6 @@ Additional information may be found in these files: Documentation/atomic_t.txt Documentation/atomic_bitops.txt -Documentation/core-api/atomic_ops.rst Documentation/core-api/refcount-vs-atomic.rst Reading code using these primitives is often also quite helpful. diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 549813cff8ab..cedf3ede7545 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,22 +11,15 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" +#define CONFIG_64BIT 1 +#include <asm/nops.h> + #include <asm/orc_types.h> #include <objtool/check.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> - -static unsigned char op_to_cfi_reg[][2] = { - {CFI_AX, CFI_R8}, - {CFI_CX, CFI_R9}, - {CFI_DX, CFI_R10}, - {CFI_BX, CFI_R11}, - {CFI_SP, CFI_R12}, - {CFI_BP, CFI_R13}, - {CFI_SI, CFI_R14}, - {CFI_DI, CFI_R15}, -}; +#include <arch/elf.h> static int is_x86_64(const struct elf *elf) { @@ -83,6 +76,31 @@ unsigned long arch_jump_destination(struct instruction *insn) return -1; \ else for (list_add_tail(&op->list, ops_list); op; op = NULL) +/* + * Helpers to decode ModRM/SIB: + * + * r/m| AX CX DX BX | SP | BP | SI DI | + * | R8 R9 R10 R11 | R12 | R13 | R14 R15 | + * Mod+----------------+-----+-----+---------+ + * 00 | [r/m] |[SIB]|[IP+]| [r/m] | + * 01 | [r/m + d8] |[S+d]| [r/m + d8] | + * 10 | [r/m + d32] |[S+D]| [r/m + d32] | + * 11 | r/ m | + */ + +#define mod_is_mem() (modrm_mod != 3) +#define mod_is_reg() (modrm_mod == 3) + +#define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && mod_is_mem()) + +#define rm_is(reg) (have_SIB() ? 
\ + sib_base == (reg) && sib_index == CFI_SP : \ + modrm_rm == (reg)) + +#define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) +#define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -90,21 +108,22 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, struct list_head *ops_list) { struct insn insn; - int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + int x86_64, ret; + unsigned char op1, op2, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; + u64 imm; x86_64 = is_x86_64(elf); if (x86_64 == -1) return -1; - insn_init(&insn, sec->data->d_buf + offset, maxlen, x86_64); - insn_get_length(&insn); - - if (!insn_complete(&insn)) { + ret = insn_decode(&insn, sec->data->d_buf + offset, maxlen, + x86_64 ? 
INSN_MODE_64 : INSN_MODE_32); + if (ret < 0) { WARN("can't decode instruction at %s:0x%lx", sec->name, offset); return -1; } @@ -129,23 +148,27 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.modrm.nbytes) { modrm = insn.modrm.bytes[0]; modrm_mod = X86_MODRM_MOD(modrm); - modrm_reg = X86_MODRM_REG(modrm); - modrm_rm = X86_MODRM_RM(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r; + modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b; } - if (insn.sib.nbytes) + if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; + /* sib_scale = X86_SIB_SCALE(sib); */ + sib_index = X86_SIB_INDEX(sib) + 8*rex_x; + sib_base = X86_SIB_BASE(sib) + 8*rex_b; + } switch (op1) { case 0x1: case 0x29: - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && rm_is_reg(CFI_SP)) { /* add/sub reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_ADD; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -157,7 +180,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* push reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->src.reg = (op1 & 0x7) + 8*rex_b; op->dest.type = OP_DEST_PUSH; } @@ -169,7 +192,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, ADD_OP(op) { op->src.type = OP_SRC_POP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->dest.reg = (op1 & 0x7) + 8*rex_b; } break; @@ -187,12 +210,54 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_JUMP_CONDITIONAL; break; - case 0x81: - case 0x83: - if (rex != 0x48) + case 0x80 ... 
0x83: + /* + * 1000 00sw : mod OP r/m : immediate + * + * s - sign extend immediate + * w - imm8 / imm32 + * + * OP: 000 ADD 100 AND + * 001 OR 101 SUB + * 010 ADC 110 XOR + * 011 SBB 111 CMP + */ + + /* 64bit only */ + if (!rex_w) + break; + + /* %rsp target only */ + if (!rm_is_reg(CFI_SP)) break; - if (modrm == 0xe4) { + imm = insn.immediate.value; + if (op1 & 2) { /* sign extend */ + if (op1 & 1) { /* imm32 */ + imm <<= 32; + imm = (s64)imm >> 32; + } else { /* imm8 */ + imm <<= 56; + imm = (s64)imm >> 56; + } + } + + switch (modrm_reg & 7) { + case 5: + imm = -imm; + /* fallthrough */ + case 0: + /* add/sub imm, %rsp */ + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = imm; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } + break; + + case 4: /* and imm, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_AND; @@ -202,53 +267,48 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; } break; - } - if (modrm == 0xc4) - sign = 1; - else if (modrm == 0xec) - sign = -1; - else + default: + /* WARN ? 
*/ break; - - /* add/sub imm, %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value * sign; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + break; case 0x89: - if (rex_w && !rex_r && modrm_reg == 4) { + if (!rex_w) + break; - if (modrm_mod == 3) { + if (modrm_reg == CFI_SP) { + + if (mod_is_reg()) { /* mov %rsp, reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; } break; } else { - /* skip nontrivial SIB */ - if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) - break; - /* skip RIP relative displacement */ - if (modrm_rm == 5 && modrm_mod == 0) + if (is_RIP()) break; + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + /* mov %rsp, disp(%reg) */ ADD_OP(op) { op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG_INDIRECT; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; op->dest.offset = insn.displacement.value; } break; @@ -257,12 +317,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rm_is_reg(CFI_SP)) { /* mov reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -271,13 +331,15 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: - if (!rex_b && - (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) { + if (!rex_w) + break; + + if (rm_is_mem(CFI_BP)) { /* mov reg, disp(%rbp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_BP; 
op->dest.offset = insn.displacement.value; @@ -285,12 +347,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rm_is_mem(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; @@ -301,7 +363,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8b: - if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) { + if (!rex_w) + break; + + if (rm_is_mem(CFI_BP)) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -309,11 +374,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_BP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; + } - } else if (rex_w && !rex_b && sib == 0x24 && - modrm_mod != 3 && modrm_rm == 4) { + if (rm_is_mem(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -321,75 +387,48 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_SP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; } break; case 0x8d: - if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - - ADD_OP(op) { - if (!insn.displacement.value) { - /* lea (%rsp), reg */ - op->src.type = OP_SRC_REG; - } else { - /* lea disp(%rsp), reg */ - op->src.type = OP_SRC_ADD; - op->src.offset = insn.displacement.value; - } - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; - } - - } else if (rex == 0x48 && modrm == 0x65) { - - /* lea disp(%rbp), %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - 
op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + if (mod_is_reg()) { + WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); + break; + } - } else if (rex == 0x49 && modrm == 0x62 && - insn.displacement.value == -8) { + /* skip non 64bit ops */ + if (!rex_w) + break; - /* - * lea -0x8(%r10), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R10; - op->src.offset = -8; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + /* skip RIP relative displacement */ + if (is_RIP()) + break; - } else if (rex == 0x49 && modrm == 0x65 && - insn.displacement.value == -16) { + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } - /* - * lea -0x10(%r13), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { + /* lea disp(%src), %dst */ + ADD_OP(op) { + op->src.offset = insn.displacement.value; + if (!op->src.offset) { + /* lea (%src), %dst */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R13; - op->src.offset = -16; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + op->src.reg = modrm_rm; + op->dest.type = OP_DEST_REG; + op->dest.reg = modrm_reg; } - break; case 0x8f: @@ -476,9 +515,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, * mov bp, sp * pop bp */ - ADD_OP(op) - op->dest.type = OP_DEST_LEAVE; - + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_BP; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } + ADD_OP(op) { + op->src.type = OP_SRC_POP; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_BP; + } break; case 0xe3: @@ -596,11 +643,11 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) const char *arch_nop_insn(int len) { 
static const char nops[5][5] = { - /* 1 */ { 0x90 }, - /* 2 */ { 0x66, 0x90 }, - /* 3 */ { 0x0f, 0x1f, 0x00 }, - /* 4 */ { 0x0f, 0x1f, 0x40, 0x00 }, - /* 5 */ { 0x0f, 0x1f, 0x44, 0x00, 0x00 }, + { BYTES_NOP1 }, + { BYTES_NOP2 }, + { BYTES_NOP3 }, + { BYTES_NOP4 }, + { BYTES_NOP5 }, }; if (len < 1 || len > 5) { @@ -611,6 +658,122 @@ const char *arch_nop_insn(int len) return nops[len-1]; } +/* asm/alternative.h ? */ + +#define ALTINSTR_FLAG_INV (1 << 15) +#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction */ +} __packed; + +static int elf_add_alternative(struct elf *elf, + struct instruction *orig, struct symbol *sym, + int cpuid, u8 orig_len, u8 repl_len) +{ + const int size = sizeof(struct alt_instr); + struct alt_instr *alt; + struct section *sec; + Elf_Scn *s; + + sec = find_section_by_name(elf, ".altinstructions"); + if (!sec) { + sec = elf_create_section(elf, ".altinstructions", + SHF_WRITE, size, 0); + + if (!sec) { + WARN_ELF("elf_create_section"); + return -1; + } + } + + s = elf_getscn(elf->elf, sec->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + sec->data = elf_newdata(s); + if (!sec->data) { + WARN_ELF("elf_newdata"); + return -1; + } + + sec->data->d_size = size; + sec->data->d_align = 1; + + alt = sec->data->d_buf = malloc(size); + if (!sec->data->d_buf) { + perror("malloc"); + return -1; + } + memset(sec->data->d_buf, 0, size); + + if (elf_add_reloc_to_insn(elf, sec, sec->sh.sh_size, + R_X86_64_PC32, orig->sec, orig->offset)) { + WARN("elf_create_reloc: alt_instr::instr_offset"); + return -1; + } + + if (elf_add_reloc(elf, sec, sec->sh.sh_size + 4, + R_X86_64_PC32, sym, 0)) { + WARN("elf_create_reloc: alt_instr::repl_offset"); + return -1; + } + + alt->cpuid = 
cpuid; + alt->instrlen = orig_len; + alt->replacementlen = repl_len; + + sec->sh.sh_size += size; + sec->changed = true; + + return 0; +} + +#define X86_FEATURE_RETPOLINE ( 7*32+12) + +int arch_rewrite_retpolines(struct objtool_file *file) +{ + struct instruction *insn; + struct reloc *reloc; + struct symbol *sym; + char name[32] = ""; + + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + + if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk")) + continue; + + reloc = insn->reloc; + + sprintf(name, "__x86_indirect_alt_%s_%s", + insn->type == INSN_JUMP_DYNAMIC ? "jmp" : "call", + reloc->sym->name + 21); + + sym = find_symbol_by_name(file->elf, name); + if (!sym) { + sym = elf_create_undef_symbol(file->elf, name); + if (!sym) { + WARN("elf_create_undef_symbol"); + return -1; + } + } + + if (elf_add_alternative(file->elf, insn, sym, + ALT_NOT(X86_FEATURE_RETPOLINE), 5, 5)) { + WARN("elf_add_alternative"); + return -1; + } + } + + return 0; +} + int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) { struct cfi_reg *cfa = &insn->cfi.cfa; @@ -646,3 +809,8 @@ int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) return 0; } + +bool arch_is_retpoline(struct symbol *sym) +{ + return !strncmp(sym->name, "__x86_indirect_", 15); +} diff --git a/tools/objtool/arch/x86/include/arch/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h index 79bc517efba8..0579d22c433c 100644 --- a/tools/objtool/arch/x86/include/arch/cfi_regs.h +++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h @@ -4,13 +4,13 @@ #define _OBJTOOL_CFI_REGS_H #define CFI_AX 0 -#define CFI_DX 1 -#define CFI_CX 2 +#define CFI_CX 1 +#define CFI_DX 2 #define CFI_BX 3 -#define CFI_SI 4 -#define CFI_DI 5 -#define CFI_BP 6 -#define CFI_SP 7 +#define CFI_SP 4 +#define CFI_BP 5 +#define CFI_SI 6 +#define CFI_DI 7 #define CFI_R8 8 #define CFI_R9 9 #define CFI_R10 10 diff --git a/tools/objtool/arch/x86/include/arch/special.h b/tools/objtool/arch/x86/include/arch/special.h index 
d818b2bffa02..14271cca0c74 100644 --- a/tools/objtool/arch/x86/include/arch/special.h +++ b/tools/objtool/arch/x86/include/arch/special.h @@ -10,7 +10,7 @@ #define JUMP_ORIG_OFFSET 0 #define JUMP_NEW_OFFSET 4 -#define ALT_ENTRY_SIZE 13 +#define ALT_ENTRY_SIZE 12 #define ALT_ORIG_OFFSET 0 #define ALT_NEW_OFFSET 4 #define ALT_FEATURE_OFFSET 8 diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c3a85d8f6c5c..8b38b5d6fec7 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,16 +15,23 @@ #include <subcmd/parse-options.h> #include <string.h> +#include <stdlib.h> #include <objtool/builtin.h> #include <objtool/objtool.h> -bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; +bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, + validate_dup, vmlinux, mcount, noinstr, backup; static const char * const check_usage[] = { "objtool check [<options>] file.o", NULL, }; +static const char * const env_usage[] = { + "OBJTOOL_ARGS=\"<options>\"", + NULL, +}; + const struct option check_options[] = { OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"), OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"), @@ -37,20 +44,44 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), + OPT_BOOLEAN('B', "backup", &backup, "create .orig files before modification"), OPT_END(), }; +int cmd_parse_options(int argc, const char **argv, const char * const usage[]) +{ + const char *envv[16] = { }; + char *env; + int envc; + + env = getenv("OBJTOOL_ARGS"); + if (env) { + envv[0] = "OBJTOOL_ARGS"; + for (envc = 1; envc < ARRAY_SIZE(envv); ) { + envv[envc++] = env; + env = strchr(env, ' '); + if (!env) + break; + *env = 
'\0'; + env++; + } + + parse_options(envc, envv, check_options, env_usage, 0); + } + + argc = parse_options(argc, argv, check_options, usage, 0); + if (argc != 1) + usage_with_options(usage, check_options); + return argc; +} + int cmd_check(int argc, const char **argv) { const char *objname; struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, check_usage, 0); - - if (argc != 1) - usage_with_options(check_usage, check_options); - + argc = cmd_parse_options(argc, argv, check_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 8273bbf7cebb..17f8b9307738 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -34,10 +34,7 @@ int cmd_orc(int argc, const char **argv) struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, orc_usage, 0); - if (argc != 1) - usage_with_options(orc_usage, check_options); - + argc = cmd_parse_options(argc, argv, orc_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5e5388a38e2a..9ed1a4cd00dc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -108,6 +108,18 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) +static bool is_jump_table_jump(struct instruction *insn) +{ + struct alt_group *alt_group = insn->alt_group; + + if (insn->jump_table) + return true; + + /* Retpoline alternative for a jump table? */ + return alt_group && alt_group->orig_group && + alt_group->orig_group->first_insn->jump_table; +} + static bool is_sibling_call(struct instruction *insn) { /* @@ -120,7 +132,7 @@ static bool is_sibling_call(struct instruction *insn) /* An indirect jump is either a sibling call or a jump to a table. 
*/ if (insn->type == INSN_JUMP_DYNAMIC) - return list_empty(&insn->alts); + return !is_jump_table_jump(insn); /* add_jump_destinations() sets insn->call_dest for sibling calls. */ return (is_static_jump(insn) && insn->call_dest); @@ -433,8 +445,7 @@ reachable: static int create_static_call_sections(struct objtool_file *file) { - struct section *sec, *reloc_sec; - struct reloc *reloc; + struct section *sec; struct static_call_site *site; struct instruction *insn; struct symbol *key_sym; @@ -452,7 +463,7 @@ static int create_static_call_sections(struct objtool_file *file) return 0; idx = 0; - list_for_each_entry(insn, &file->static_call_list, static_call_node) + list_for_each_entry(insn, &file->static_call_list, call_node) idx++; sec = elf_create_section(file->elf, ".static_call_sites", SHF_WRITE, @@ -460,36 +471,18 @@ static int create_static_call_sections(struct objtool_file *file) if (!sec) return -1; - reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!reloc_sec) - return -1; - idx = 0; - list_for_each_entry(insn, &file->static_call_list, static_call_node) { + list_for_each_entry(insn, &file->static_call_list, call_node) { site = (struct static_call_site *)sec->data->d_buf + idx; memset(site, 0, sizeof(struct static_call_site)); /* populate reloc for 'addr' */ - reloc = malloc(sizeof(*reloc)); - - if (!reloc) { - perror("malloc"); - return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - insn_to_reloc_sym_addend(insn->sec, insn->offset, reloc); - if (!reloc->sym) { - WARN_FUNC("static call tramp: missing containing symbol", - insn->sec, insn->offset); + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(struct static_call_site), + R_X86_64_PC32, + insn->sec, insn->offset)) return -1; - } - - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(struct static_call_site); - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, reloc); /* find key symbol */ key_name = strdup(insn->call_dest->name); @@ -526,32 +519,21 @@ static int 
create_static_call_sections(struct objtool_file *file) free(key_name); /* populate reloc for 'key' */ - reloc = malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc(file->elf, sec, + idx * sizeof(struct static_call_site) + 4, + R_X86_64_PC32, key_sym, + is_sibling_call(insn) * STATIC_CALL_SITE_TAIL)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - reloc->sym = key_sym; - reloc->addend = is_sibling_call(insn) ? STATIC_CALL_SITE_TAIL : 0; - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(struct static_call_site) + 4; - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, reloc); idx++; } - if (elf_rebuild_reloc_section(file->elf, reloc_sec)) - return -1; - return 0; } static int create_mcount_loc_sections(struct objtool_file *file) { - struct section *sec, *reloc_sec; - struct reloc *reloc; + struct section *sec; unsigned long *loc; struct instruction *insn; int idx; @@ -574,49 +556,21 @@ static int create_mcount_loc_sections(struct objtool_file *file) if (!sec) return -1; - reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!reloc_sec) - return -1; - idx = 0; list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { loc = (unsigned long *)sec->data->d_buf + idx; memset(loc, 0, sizeof(unsigned long)); - reloc = malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned long), + R_X86_64_64, + insn->sec, insn->offset)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - if (insn->sec->sym) { - reloc->sym = insn->sec->sym; - reloc->addend = insn->offset; - } else { - reloc->sym = find_symbol_containing(insn->sec, insn->offset); - - if (!reloc->sym) { - WARN("missing symbol for insn at offset 0x%lx\n", - insn->offset); - return -1; - } - - reloc->addend = insn->offset - reloc->sym->offset; - } - - reloc->type = R_X86_64_64; - reloc->offset = idx * sizeof(unsigned long); - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, 
reloc); idx++; } - if (elf_rebuild_reloc_section(file->elf, reloc_sec)) - return -1; - return 0; } @@ -850,6 +804,30 @@ static int add_ignore_alternatives(struct objtool_file *file) return 0; } +__weak bool arch_is_retpoline(struct symbol *sym) +{ + return false; +} + +#define NEGATIVE_RELOC ((void *)-1L) + +static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *insn) +{ + if (insn->reloc == NEGATIVE_RELOC) + return NULL; + + if (!insn->reloc) { + insn->reloc = find_reloc_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); + if (!insn->reloc) { + insn->reloc = NEGATIVE_RELOC; + return NULL; + } + } + + return insn->reloc; +} + /* * Find the destination instructions for all jumps. */ @@ -864,16 +842,14 @@ static int add_jump_destinations(struct objtool_file *file) if (!is_static_jump(insn)) continue; - reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + reloc = insn_reloc(file, insn); if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) || - !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) { + } else if (arch_is_retpoline(reloc->sym)) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. 
@@ -883,13 +859,16 @@ static int add_jump_destinations(struct objtool_file *file) else insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; + list_add_tail(&insn->call_node, + &file->retpoline_call_list); + insn->retpoline_safe = true; continue; } else if (insn->func) { /* internal or external sibling call (with reloc) */ insn->call_dest = reloc->sym; if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, + list_add_tail(&insn->call_node, &file->static_call_list); } continue; @@ -951,7 +930,7 @@ static int add_jump_destinations(struct objtool_file *file) /* internal sibling call (without reloc) */ insn->call_dest = insn->jump_dest->func; if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, + list_add_tail(&insn->call_node, &file->static_call_list); } } @@ -995,8 +974,7 @@ static int add_call_destinations(struct objtool_file *file) if (insn->type != INSN_CALL) continue; - reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + reloc = insn_reloc(file, insn); if (!reloc) { dest_off = arch_jump_destination(insn); insn->call_dest = find_call_destination(insn->sec, dest_off); @@ -1026,9 +1004,29 @@ static int add_call_destinations(struct objtool_file *file) dest_off); return -1; } + + } else if (arch_is_retpoline(reloc->sym)) { + /* + * Retpoline calls are really dynamic calls in + * disguise, so convert them accordingly. 
+ */ + insn->type = INSN_CALL_DYNAMIC; + insn->retpoline_safe = true; + + list_add_tail(&insn->call_node, + &file->retpoline_call_list); + + remove_insn_ops(insn); + continue; + } else insn->call_dest = reloc->sym; + if (insn->call_dest && insn->call_dest->static_call_tramp) { + list_add_tail(&insn->call_node, + &file->static_call_list); + } + /* * Many compilers cannot disable KCOV with a function attribute * so they need a little help, NOP out any KCOV calls from noinstr @@ -1175,8 +1173,7 @@ static int handle_group_alt(struct objtool_file *file, * alternatives code can adjust the relative offsets * accordingly. */ - alt_reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + alt_reloc = insn_reloc(file, insn); if (alt_reloc && !arch_support_alt_relocation(special_alt, insn, alt_reloc)) { @@ -1751,6 +1748,11 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } +__weak int arch_rewrite_retpolines(struct objtool_file *file) +{ + return 0; +} + static int decode_sections(struct objtool_file *file) { int ret; @@ -1772,10 +1774,17 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_{jump_call}_destination. + */ ret = read_static_call_tramps(file); if (ret) return ret; + /* + * Must be before add_special_section_alts() as that depends on + * jump_dest being set. + */ ret = add_jump_destinations(file); if (ret) return ret; @@ -1784,6 +1793,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_call_destination(); it changes INSN_CALL to + * INSN_JUMP. + */ ret = read_intra_function_calls(file); if (ret) return ret; @@ -1808,6 +1821,15 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be after add_special_section_alts(), since this will emit + * alternatives. Must be after add_{jump,call}_destination(), since + * those create the call insn lists. 
+ */ + ret = arch_rewrite_retpolines(file); + if (ret) + return ret; + return 0; } @@ -1959,8 +1981,9 @@ static void restore_reg(struct cfi_state *cfi, unsigned char reg) * 41 5d pop %r13 * c3 retq */ -static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, - struct stack_op *op) +static int update_cfi_state(struct instruction *insn, + struct instruction *next_insn, + struct cfi_state *cfi, struct stack_op *op) { struct cfi_reg *cfa = &cfi->cfa; struct cfi_reg *regs = cfi->regs; @@ -2019,7 +2042,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, } else if (op->src.reg == CFI_BP && op->dest.reg == CFI_SP && - cfa->base == CFI_BP) { + (cfa->base == CFI_BP || cfa->base == cfi->drap_reg)) { /* * mov %rbp, %rsp @@ -2161,7 +2184,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } - if (op->dest.reg == cfi->cfa.base) { + if (op->dest.reg == cfi->cfa.base && !(next_insn && next_insn->hint)) { WARN_FUNC("unsupported stack register modification", insn->sec, insn->offset); return -1; @@ -2216,7 +2239,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, cfa->offset = 0; cfi->drap_offset = -1; - } else if (regs[op->dest.reg].offset == -cfi->stack_size) { + } else if (cfi->stack_size == -regs[op->dest.reg].offset) { /* pop %reg */ restore_reg(cfi, op->dest.reg); @@ -2357,26 +2380,6 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; - case OP_DEST_LEAVE: - if ((!cfi->drap && cfa->base != CFI_BP) || - (cfi->drap && cfa->base != cfi->drap_reg)) { - WARN_FUNC("leave instruction with modified stack frame", - insn->sec, insn->offset); - return -1; - } - - /* leave (mov %rbp, %rsp; pop %rbp) */ - - cfi->stack_size = -cfi->regs[CFI_BP].offset - 8; - restore_reg(cfi, CFI_BP); - - if (!cfi->drap) { - cfa->base = CFI_SP; - cfa->offset -= 8; - } - - break; - case OP_DEST_MEM: if (op->src.type != OP_SRC_POP && op->src.type != OP_SRC_POPF) 
{ WARN_FUNC("unknown stack-related memory operation", @@ -2433,13 +2436,15 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn return 0; } -static int handle_insn_ops(struct instruction *insn, struct insn_state *state) +static int handle_insn_ops(struct instruction *insn, + struct instruction *next_insn, + struct insn_state *state) { struct stack_op *op; list_for_each_entry(op, &insn->stack_ops, list) { - if (update_cfi_state(insn, &state->cfi, op)) + if (update_cfi_state(insn, next_insn, &state->cfi, op)) return 1; if (!insn->alt_group) @@ -2722,7 +2727,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } - if (handle_insn_ops(insn, &state)) + if (handle_insn_ops(insn, next_insn, &state)) return 1; switch (insn->type) { @@ -2746,11 +2751,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (dead_end_function(file, insn->call_dest)) return 0; - if (insn->type == INSN_CALL && insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, - &file->static_call_list); - } - break; case INSN_JUMP_CONDITIONAL: diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 93fa833a49a5..d08f5f3670f8 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -211,32 +211,6 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns return find_reloc_by_dest_range(elf, sec, offset, 1); } -void insn_to_reloc_sym_addend(struct section *sec, unsigned long offset, - struct reloc *reloc) -{ - if (sec->sym) { - reloc->sym = sec->sym; - reloc->addend = offset; - return; - } - - /* - * The Clang assembler strips section symbols, so we have to reference - * the function symbol instead: - */ - reloc->sym = find_symbol_containing(sec, offset); - if (!reloc->sym) { - /* - * Hack alert. This happens when we need to reference the NOP - * pad insn immediately after the function. 
- */ - reloc->sym = find_symbol_containing(sec, offset - 1); - } - - if (reloc->sym) - reloc->addend = offset - reloc->sym->offset; -} - static int read_sections(struct elf *elf) { Elf_Scn *s = NULL; @@ -316,12 +290,39 @@ static int read_sections(struct elf *elf) return 0; } +static void elf_add_symbol(struct elf *elf, struct symbol *sym) +{ + struct list_head *entry; + struct rb_node *pnode; + + sym->type = GELF_ST_TYPE(sym->sym.st_info); + sym->bind = GELF_ST_BIND(sym->sym.st_info); + + sym->offset = sym->sym.st_value; + sym->len = sym->sym.st_size; + + rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); + pnode = rb_prev(&sym->node); + if (pnode) + entry = &rb_entry(pnode, struct symbol, node)->list; + else + entry = &sym->sec->symbol_list; + list_add(&sym->list, entry); + elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); + elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); + + /* + * Don't store empty STT_NOTYPE symbols in the rbtree. They + * can exist within a function, confusing the sorting. 
+ */ + if (!sym->len) + rb_erase(&sym->node, &sym->sec->symbol_tree); +} + static int read_symbols(struct elf *elf) { struct section *symtab, *symtab_shndx, *sec; struct symbol *sym, *pfunc; - struct list_head *entry; - struct rb_node *pnode; int symbols_nr, i; char *coldstr; Elf_Data *shndx_data = NULL; @@ -366,9 +367,6 @@ static int read_symbols(struct elf *elf) goto err; } - sym->type = GELF_ST_TYPE(sym->sym.st_info); - sym->bind = GELF_ST_BIND(sym->sym.st_info); - if ((sym->sym.st_shndx > SHN_UNDEF && sym->sym.st_shndx < SHN_LORESERVE) || (shndx_data && sym->sym.st_shndx == SHN_XINDEX)) { @@ -381,32 +379,14 @@ static int read_symbols(struct elf *elf) sym->name); goto err; } - if (sym->type == STT_SECTION) { + if (GELF_ST_TYPE(sym->sym.st_info) == STT_SECTION) { sym->name = sym->sec->name; sym->sec->sym = sym; } } else sym->sec = find_section_by_index(elf, 0); - sym->offset = sym->sym.st_value; - sym->len = sym->sym.st_size; - - rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); - pnode = rb_prev(&sym->node); - if (pnode) - entry = &rb_entry(pnode, struct symbol, node)->list; - else - entry = &sym->sec->symbol_list; - list_add(&sym->list, entry); - elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); - elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); - - /* - * Don't store empty STT_NOTYPE symbols in the rbtree. They - * can exist within a function, confusing the sorting. 
- */ - if (!sym->len) - rb_erase(&sym->node, &sym->sec->symbol_tree); + elf_add_symbol(elf, sym); } if (stats) @@ -473,12 +453,73 @@ err: return -1; } -void elf_add_reloc(struct elf *elf, struct reloc *reloc) +static struct section *elf_create_reloc_section(struct elf *elf, + struct section *base, + int reltype); + +int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int type, struct symbol *sym, int addend) { - struct section *sec = reloc->sec; + struct reloc *reloc; - list_add_tail(&reloc->list, &sec->reloc_list); + if (!sec->reloc && !elf_create_reloc_section(elf, sec, SHT_RELA)) + return -1; + + reloc = malloc(sizeof(*reloc)); + if (!reloc) { + perror("malloc"); + return -1; + } + memset(reloc, 0, sizeof(*reloc)); + + reloc->sec = sec->reloc; + reloc->offset = offset; + reloc->type = type; + reloc->sym = sym; + reloc->addend = addend; + + list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); + + sec->reloc->changed = true; + + return 0; +} + +int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int type, + struct section *insn_sec, unsigned long insn_off) +{ + struct symbol *sym; + int addend; + + if (insn_sec->sym) { + sym = insn_sec->sym; + addend = insn_off; + + } else { + /* + * The Clang assembler strips section symbols, so we have to + * reference the function symbol instead: + */ + sym = find_symbol_containing(insn_sec, insn_off); + if (!sym) { + /* + * Hack alert. This happens when we need to reference + * the NOP pad insn immediately after the function. 
+ */ + sym = find_symbol_containing(insn_sec, insn_off - 1); + } + + if (!sym) { + WARN("can't find symbol containing %s+0x%lx", insn_sec->name, insn_off); + return -1; + } + + addend = insn_off - sym->offset; + } + + return elf_add_reloc(elf, sec, offset, type, sym, addend); } static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) @@ -558,7 +599,9 @@ static int read_relocs(struct elf *elf) return -1; } - elf_add_reloc(elf, reloc); + list_add_tail(&reloc->list, &sec->reloc_list); + elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); + nr_reloc++; } max_reloc = max(max_reloc, nr_reloc); @@ -636,13 +679,108 @@ err: return NULL; } +static int elf_add_string(struct elf *elf, struct section *strtab, char *str) +{ + Elf_Data *data; + Elf_Scn *s; + int len; + + if (!strtab) + strtab = find_section_by_name(elf, ".strtab"); + if (!strtab) { + WARN("can't find .strtab section"); + return -1; + } + + s = elf_getscn(elf->elf, strtab->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + data = elf_newdata(s); + if (!data) { + WARN_ELF("elf_newdata"); + return -1; + } + + data->d_buf = str; + data->d_size = strlen(str) + 1; + data->d_align = 1; + + len = strtab->len; + strtab->len += data->d_size; + strtab->changed = true; + + return len; +} + +struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) +{ + struct section *symtab; + struct symbol *sym; + Elf_Data *data; + Elf_Scn *s; + + sym = malloc(sizeof(*sym)); + if (!sym) { + perror("malloc"); + return NULL; + } + memset(sym, 0, sizeof(*sym)); + + sym->name = strdup(name); + + sym->sym.st_name = elf_add_string(elf, NULL, sym->name); + if (sym->sym.st_name == -1) + return NULL; + + sym->sym.st_info = GELF_ST_INFO(STB_GLOBAL, STT_NOTYPE); + // st_other 0 + // st_shndx 0 + // st_value 0 + // st_size 0 + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { + WARN("can't find .symtab"); + return NULL; + } + + s = elf_getscn(elf->elf, 
symtab->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return NULL; + } + + data = elf_newdata(s); + if (!data) { + WARN_ELF("elf_newdata"); + return NULL; + } + + data->d_buf = &sym->sym; + data->d_size = sizeof(sym->sym); + data->d_align = 1; + + sym->idx = symtab->len / sizeof(sym->sym); + + symtab->len += data->d_size; + symtab->changed = true; + + sym->sec = find_section_by_index(elf, 0); + + elf_add_symbol(elf, sym); + + return sym; +} + struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr) { struct section *sec, *shstrtab; size_t size = entsize * nr; Elf_Scn *s; - Elf_Data *data; sec = malloc(sizeof(*sec)); if (!sec) { @@ -699,7 +837,6 @@ struct section *elf_create_section(struct elf *elf, const char *name, sec->sh.sh_addralign = 1; sec->sh.sh_flags = SHF_ALLOC | sh_flags; - /* Add section name to .shstrtab (or .strtab for Clang) */ shstrtab = find_section_by_name(elf, ".shstrtab"); if (!shstrtab) @@ -708,27 +845,9 @@ struct section *elf_create_section(struct elf *elf, const char *name, WARN("can't find .shstrtab or .strtab section"); return NULL; } - - s = elf_getscn(elf->elf, shstrtab->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); + sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); + if (sec->sh.sh_name == -1) return NULL; - } - - data->d_buf = sec->name; - data->d_size = strlen(name) + 1; - data->d_align = 1; - - sec->sh.sh_name = shstrtab->len; - - shstrtab->len += strlen(name) + 1; - shstrtab->changed = true; list_add_tail(&sec->list, &elf->sections); elf_hash_add(elf->section_hash, &sec->hash, sec->idx); @@ -799,7 +918,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec return sec; } -struct section *elf_create_reloc_section(struct elf *elf, +static struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype) { @@ -873,14 +992,11 @@ 
static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) return 0; } -int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) +static int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) { struct reloc *reloc; int nr; - sec->changed = true; - elf->changed = true; - nr = 0; list_for_each_entry(reloc, &sec->reloc_list, list) nr++; @@ -944,9 +1060,15 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; - /* Update section headers for changed sections: */ + /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { + if (sec->base && + elf_rebuild_reloc_section(elf, sec)) { + WARN("elf_rebuild_reloc_section"); + return -1; + } + s = elf_getscn(elf->elf, sec->idx); if (!s) { WARN_ELF("elf_getscn"); @@ -958,6 +1080,7 @@ int elf_write(struct elf *elf) } sec->changed = false; + elf->changed = true; } } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 6ff0685f5cc5..062bb6e9b865 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -35,7 +35,6 @@ enum op_dest_type { OP_DEST_MEM, OP_DEST_PUSH, OP_DEST_PUSHF, - OP_DEST_LEAVE, }; struct op_dest { @@ -86,4 +85,8 @@ const char *arch_nop_insn(int len); int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); +bool arch_is_retpoline(struct symbol *sym); + +int arch_rewrite_retpolines(struct objtool_file *file); + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 2502bb27de17..15ac0b7d3d6a 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -8,7 +8,10 @@ #include <subcmd/parse-options.h> extern const struct option check_options[]; -extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; +extern bool no_fp, no_unreachable, retpoline, 
module, backtrace, uaccess, stats, + validate_dup, vmlinux, mcount, noinstr, backup; + +extern int cmd_parse_options(int argc, const char **argv, const char * const usage[]); extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index f5be798107bc..56d50bc50c10 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -39,7 +39,7 @@ struct alt_group { struct instruction { struct list_head list; struct hlist_node hash; - struct list_head static_call_node; + struct list_head call_node; struct list_head mcount_loc_node; struct section *sec; unsigned long offset; @@ -56,6 +56,7 @@ struct instruction { struct instruction *jump_dest; struct instruction *first_jump_src; struct reloc *jump_table; + struct reloc *reloc; struct list_head alts; struct symbol *func; struct list_head stack_ops; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e6890cc70a25..45e5ede363b0 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -122,12 +122,18 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); -struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); -void elf_add_reloc(struct elf *elf, struct reloc *reloc); + +int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int type, struct symbol *sym, int addend); +int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int type, + struct section *insn_sec, unsigned long insn_off); + int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int len, const char *insn); int 
elf_write_reloc(struct elf *elf, struct reloc *reloc); +struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name); int elf_write(struct elf *elf); void elf_close(struct elf *elf); @@ -140,9 +146,6 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); -void insn_to_reloc_sym_addend(struct section *sec, unsigned long offset, - struct reloc *reloc); -int elf_rebuild_reloc_section(struct elf *elf, struct section *sec); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index e68e37476c15..e4084afb2304 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -18,6 +18,7 @@ struct objtool_file { struct elf *elf; struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 20); + struct list_head retpoline_call_list; struct list_head static_call_list; struct list_head mcount_loc_list; bool ignore_unreachables, c_file, hints, rodata; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 7b97ce499405..e21db8bce493 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -17,6 +17,7 @@ #include <stdbool.h> #include <string.h> #include <stdlib.h> +#include <unistd.h> #include <subcmd/exec-cmd.h> #include <subcmd/pager.h> #include <linux/kernel.h> @@ -44,6 +45,64 @@ bool help; const char *objname; static struct objtool_file file; +static bool objtool_create_backup(const char *_objname) +{ + int len = strlen(_objname); + char *buf, *base, *name = malloc(len+6); + int s, d, l, t; + + if (!name) { + perror("failed backup name malloc"); + return false; + } + + strcpy(name, _objname); + strcpy(name + len, ".orig"); + + d = open(name, 
O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (d < 0) { + perror("failed to create backup file"); + return false; + } + + s = open(_objname, O_RDONLY); + if (s < 0) { + perror("failed to open orig file"); + return false; + } + + buf = malloc(4096); + if (!buf) { + perror("failed backup data malloc"); + return false; + } + + while ((l = read(s, buf, 4096)) > 0) { + base = buf; + do { + t = write(d, base, l); + if (t < 0) { + perror("failed backup write"); + return false; + } + base += t; + l -= t; + } while (l); + } + + if (l < 0) { + perror("failed backup read"); + return false; + } + + free(name); + free(buf); + close(d); + close(s); + + return true; +} + struct objtool_file *objtool_open_read(const char *_objname) { if (objname) { @@ -59,8 +118,14 @@ struct objtool_file *objtool_open_read(const char *_objname) if (!file.elf) return NULL; + if (backup && !objtool_create_backup(objname)) { + WARN("can't create backup file"); + return NULL; + } + INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); + INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 738aa5021bc4..dc9b7dd314b0 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -82,12 +82,11 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi) } static int write_orc_entry(struct elf *elf, struct section *orc_sec, - struct section *ip_rsec, unsigned int idx, + struct section *ip_sec, unsigned int idx, struct section *insn_sec, unsigned long insn_off, struct orc_entry *o) { struct orc_entry *orc; - struct reloc *reloc; /* populate ORC data */ orc = (struct orc_entry *)orc_sec->data->d_buf + idx; @@ -96,25 +95,9 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ - reloc = 
malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, + insn_sec, insn_off)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - insn_to_reloc_sym_addend(insn_sec, insn_off, reloc); - if (!reloc->sym) { - WARN("missing symbol for insn at offset 0x%lx", - insn_off); - return -1; - } - - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(int); - reloc->sec = ip_rsec; - - elf_add_reloc(elf, reloc); return 0; } @@ -153,7 +136,7 @@ static unsigned long alt_group_len(struct alt_group *alt_group) int orc_create(struct objtool_file *file) { - struct section *sec, *ip_rsec, *orc_sec; + struct section *sec, *orc_sec; unsigned int nr = 0, idx = 0; struct orc_list_entry *entry; struct list_head orc_list; @@ -242,20 +225,14 @@ int orc_create(struct objtool_file *file) sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), nr); if (!sec) return -1; - ip_rsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!ip_rsec) - return -1; /* Write ORC entries to sections: */ list_for_each_entry(entry, &orc_list, list) { - if (write_orc_entry(file->elf, orc_sec, ip_rsec, idx++, + if (write_orc_entry(file->elf, orc_sec, sec, idx++, entry->insn_sec, entry->insn_off, &entry->orc)) return -1; } - if (elf_rebuild_reloc_section(file->elf, ip_rsec)) - return -1; - return 0; } diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 2c7fbda7b055..07b21cfabf5c 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -106,6 +106,14 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, return -1; } + /* + * Skip retpoline .altinstr_replacement... we already rewrite the + * instructions for retpolines anyway, see arch_is_retpoline() + * usage in add_{call,jump}_destinations(). 
+ */ + if (arch_is_retpoline(new_reloc->sym)) + return 1; + alt->new_sec = new_reloc->sym->sec; alt->new_off = (unsigned int)new_reloc->addend; @@ -154,7 +162,9 @@ int special_get_alts(struct elf *elf, struct list_head *alts) memset(alt, 0, sizeof(*alt)); ret = get_alt_entry(elf, entry, sec, idx, alt); - if (ret) + if (ret > 0) + continue; + if (ret < 0) return ret; list_add_tail(&alt->list, alts); diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 606a4b5e929f..105a291ff8e7 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -10,17 +10,21 @@ FILES="include/linux/objtool.h" if [ "$SRCARCH" = "x86" ]; then FILES="$FILES +arch/x86/include/asm/nops.h arch/x86/include/asm/inat_types.h arch/x86/include/asm/orc_types.h arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk include/linux/static_call_types.h -arch/x86/include/asm/inat.h -I '^#include [\"<]\(asm/\)*inat_types.h[\">]' -arch/x86/include/asm/insn.h -I '^#include [\"<]\(asm/\)*inat.h[\">]' -arch/x86/lib/inat.c -I '^#include [\"<]\(../include/\)*asm/insn.h[\">]' -arch/x86/lib/insn.c -I '^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]' -I '^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]' " + +SYNC_CHECK_FILES=' +arch/x86/include/asm/inat.h +arch/x86/include/asm/insn.h +arch/x86/lib/inat.c +arch/x86/lib/insn.c +' fi check_2 () { @@ -63,3 +67,9 @@ while read -r file_entry; do done <<EOF $FILES EOF + +if [ "$SRCARCH" = "x86" ]; then + for i in $SYNC_CHECK_FILES; do + check $i '-I "^.*\/\*.*__ignore_sync_check__.*\*\/.*$"' + done +fi diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py diff --git a/tools/perf/arch/x86/tests/insn-x86.c 
b/tools/perf/arch/x86/tests/insn-x86.c index 4f75ae990140..0262b0d8ccf5 100644 --- a/tools/perf/arch/x86/tests/insn-x86.c +++ b/tools/perf/arch/x86/tests/insn-x86.c @@ -96,13 +96,12 @@ static int get_branch(const char *branch_str) static int test_data_item(struct test_data *dat, int x86_64) { struct intel_pt_insn intel_pt_insn; + int op, branch, ret; struct insn insn; - int op, branch; - insn_init(&insn, dat->data, MAX_INSN_SIZE, x86_64); - insn_get_length(&insn); - - if (!insn_complete(&insn)) { + ret = insn_decode(&insn, dat->data, MAX_INSN_SIZE, + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + if (ret < 0) { pr_debug("Failed to decode: %s\n", dat->asm_rep); return -1; } diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c index 34d600c51044..546feda08428 100644 --- a/tools/perf/arch/x86/util/archinsn.c +++ b/tools/perf/arch/x86/util/archinsn.c @@ -11,7 +11,7 @@ void arch_fetch_insn(struct perf_sample *sample, struct machine *machine) { struct insn insn; - int len; + int len, ret; bool is64bit = false; if (!sample->ip) @@ -19,8 +19,9 @@ void arch_fetch_insn(struct perf_sample *sample, len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit); if (len <= 0) return; - insn_init(&insn, sample->insn, len, is64bit); - insn_get_length(&insn); - if (insn_complete(&insn) && insn.length <= len) + + ret = insn_decode(&insn, sample->insn, len, + is64bit ? 
INSN_MODE_64 : INSN_MODE_32); + if (ret >= 0 && insn.length <= len) sample->insn_len = insn.length; } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a6420c647959..6df0dc00d73a 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -776,6 +776,12 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, } } + if (!opts->auxtrace_snapshot_mode && !opts->auxtrace_sample_mode) { + u32 aux_watermark = opts->auxtrace_mmap_pages * page_size / 4; + + intel_pt_evsel->core.attr.aux_watermark = aux_watermark; + } + intel_pt_parse_terms(intel_pt_pmu->name, &intel_pt_pmu->format, "tsc", &tsc_bit); diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index ace8772a4f03..7c4a9d424a64 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -402,35 +402,42 @@ static pid_t handle_signalfd(struct daemon *daemon) int status; pid_t pid; + /* + * Take signal fd data as pure signal notification and check all + * the sessions state. The reason is that multiple signals can get + * coalesced in kernel and we can receive only single signal even + * if multiple SIGCHLD were generated. 
+ */ err = read(daemon->signal_fd, &si, sizeof(struct signalfd_siginfo)); - if (err != sizeof(struct signalfd_siginfo)) + if (err != sizeof(struct signalfd_siginfo)) { + pr_err("failed to read signal fd\n"); return -1; + } list_for_each_entry(session, &daemon->sessions, list) { + if (session->pid == -1) + continue; - if (session->pid != (int) si.ssi_pid) + pid = waitpid(session->pid, &status, WNOHANG); + if (pid <= 0) continue; - pid = waitpid(session->pid, &status, 0); - if (pid == session->pid) { - if (WIFEXITED(status)) { - pr_info("session '%s' exited, status=%d\n", - session->name, WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - pr_info("session '%s' killed (signal %d)\n", - session->name, WTERMSIG(status)); - } else if (WIFSTOPPED(status)) { - pr_info("session '%s' stopped (signal %d)\n", - session->name, WSTOPSIG(status)); - } else { - pr_info("session '%s' Unexpected status (0x%x)\n", - session->name, status); - } + if (WIFEXITED(status)) { + pr_info("session '%s' exited, status=%d\n", + session->name, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + pr_info("session '%s' killed (signal %d)\n", + session->name, WTERMSIG(status)); + } else if (WIFSTOPPED(status)) { + pr_info("session '%s' stopped (signal %d)\n", + session->name, WSTOPSIG(status)); + } else { + pr_info("session '%s' Unexpected status (0x%x)\n", + session->name, status); } session->state = KILL; session->pid = -1; - return pid; } return 0; @@ -443,7 +450,6 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d .fd = daemon->signal_fd, .events = POLLIN, }; - pid_t wpid = 0, pid = session->pid; time_t start; start = time(NULL); @@ -452,7 +458,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d int err = poll(&pollfd, 1, 1000); if (err > 0) { - wpid = handle_signalfd(daemon); + handle_signalfd(daemon); } else if (err < 0) { perror("failed: poll\n"); return -1; @@ -460,7 +466,7 @@ static int 
daemon_session__wait(struct daemon_session *session, struct daemon *d if (start + secs < time(NULL)) return -1; - } while (wpid != pid); + } while (session->pid != -1); return 0; } @@ -902,7 +908,9 @@ static void daemon_session__kill(struct daemon_session *session, daemon_session__signal(session, SIGKILL); break; default: - break; + pr_err("failed to wait for session %s\n", + session->name); + return; } how++; @@ -955,7 +963,8 @@ static void daemon__kill(struct daemon *daemon) daemon__signal(daemon, SIGKILL); break; default: - break; + pr_err("failed to wait for sessions\n"); + return; } how++; @@ -1344,7 +1353,7 @@ out: close(sock_fd); if (conf_fd != -1) close(conf_fd); - if (conf_fd != -1) + if (signal_fd != -1) close(signal_fd); pr_info("daemon exited\n"); diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index d49448a1060c..87cb11a7a3ee 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace) for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) { scnprintf(buf, sizeof(buf), "%d", - ftrace->evlist->core.threads->map[i]); + perf_thread_map__pid(ftrace->evlist->core.threads, i)); if (append_tracing_file("set_ftrace_pid", buf) < 0) return -1; } diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 6fe44d97fde5..ddccc0eb7390 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -906,7 +906,7 @@ int cmd_inject(int argc, const char **argv) } data.path = inject.input_name; - inject.session = perf_session__new(&data, true, &inject.tool); + inject.session = perf_session__new(&data, inject.output.is_pipe, &inject.tool); if (IS_ERR(inject.session)) return PTR_ERR(inject.session); diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index dded93a2bc89..07857dfb4d91 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -75,6 +75,13 @@ 
include/uapi/asm-generic/mman-common.h include/uapi/asm-generic/unistd.h ' +SYNC_CHECK_FILES=' +arch/x86/include/asm/inat.h +arch/x86/include/asm/insn.h +arch/x86/lib/inat.c +arch/x86/lib/insn.c +' + # These copies are under tools/perf/trace/beauty/ as they are not used to in # building object files only by scripts in tools/perf/trace/beauty/ to generate # tables that then gets included in .c files for things like id->string syscall @@ -129,6 +136,10 @@ for i in $FILES; do check $i -B done +for i in $SYNC_CHECK_FILES; do + check $i '-I "^.*\/\*.*__ignore_sync_check__.*\*\/.*$"' +done + # diff with extra ignore lines check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>" -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))"' check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>" -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"' @@ -137,10 +148,6 @@ check include/uapi/linux/mman.h '-I "^#include <\(uapi/\)*asm/mman.h>"' check include/linux/build_bug.h '-I "^#\(ifndef\|endif\)\( \/\/\)* static_assert$"' check include/linux/ctype.h '-I "isdigit("' check lib/ctype.c '-I "^EXPORT_SYMBOL" -I "^#include <linux/export.h>" -B' -check arch/x86/include/asm/inat.h '-I "^#include [\"<]\(asm/\)*inat_types.h[\">]"' -check arch/x86/include/asm/insn.h '-I "^#include [\"<]\(asm/\)*inat.h[\">]"' -check arch/x86/lib/inat.c '-I "^#include [\"<]\(../include/\)*asm/insn.h[\">]"' -check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]" -I "^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]"' # diff non-symmetric files check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index f57e075b0ed2..c72adbd67386 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -86,7 +86,7 @@ static struct { .msg_load_fail = "check your vmlinux setting?", .target_func = &epoll_pwait_loop, 
.expect_result = (NR_ITERS + 1) / 2, - .pin = true, + .pin = true, }, #ifdef HAVE_BPF_PROLOGUE { @@ -99,13 +99,6 @@ static struct { .expect_result = (NR_ITERS + 1) / 4, }, #endif - { - .prog_id = LLVM_TESTCASE_BPF_RELOCATION, - .desc = "BPF relocation checker", - .name = "[bpf_relocation_test]", - .msg_compile_fail = "fix 'perf test LLVM' first", - .msg_load_fail = "libbpf error when dealing with relocation", - }, }; static int do_test(struct bpf_object *obj, int (*func)(void), diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 5ad3ca8d681b..58984380b211 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # daemon operations # SPDX-License-Identifier: GPL-2.0 diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh index 27ee1ea1fe94..9b0614a87831 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h printf "static const char *x86_MSRs[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*' -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSCDEADLINE|UCODE_REV)|IDT_FCR4)' | \ +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \ sed -r "s/$regex/\2 \1/g" | sort -n | \ xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c index f3ac9d40cebf..2e5eff4f8f03 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c +++ b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c @@ -210,8 +210,10 @@ static int arm_spe_do_get_packet(const unsigned char *buf, size_t len, if ((hdr & SPE_HEADER0_MASK2) == 
SPE_HEADER0_EXTENDED) { /* 16-bit extended format header */ - ext_hdr = 1; + if (len == 1) + return ARM_SPE_BAD_PACKET; + ext_hdr = 1; hdr = buf[1]; if (hdr == SPE_HEADER1_ALIGNMENT) return arm_spe_get_alignment(buf, len, packet); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 953f4afacd3b..1b4091a3b508 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -298,10 +298,6 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues, queue->set = true; queue->tid = buffer->tid; queue->cpu = buffer->cpu; - } else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) { - pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n", - queue->cpu, queue->tid, buffer->cpu, buffer->tid); - return -EINVAL; } buffer->buffer_nr = queues->next_buffer_nr++; @@ -638,7 +634,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, break; } - if (itr) + if (itr && itr->parse_snapshot_options) return itr->parse_snapshot_options(itr, opts, str); pr_err("No AUX area tracing to snapshot\n"); diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 423ec69bda6c..5ecd4f401f32 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -201,7 +201,7 @@ static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt, double ratio = 0.0; if (block_fmt->total_cycles) - ratio = (double)bi->cycles / (double)block_fmt->total_cycles; + ratio = (double)bi->cycles_aggr / (double)block_fmt->total_cycles; return color_pct(hpp, block_fmt->width, 100.0 * ratio); } @@ -216,9 +216,9 @@ static int64_t block_total_cycles_pct_sort(struct perf_hpp_fmt *fmt, double l, r; if (block_fmt->total_cycles) { - l = ((double)bi_l->cycles / + l = ((double)bi_l->cycles_aggr / (double)block_fmt->total_cycles) * 100000.0; - r = ((double)bi_r->cycles / + r = ((double)bi_r->cycles_aggr / (double)block_fmt->total_cycles) * 100000.0; return (int64_t)l - (int64_t)r; } diff --git 
a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 57d58c81a5f8..cdecda1ddd36 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -196,25 +196,32 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, } if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) { + free(info_linear); pr_debug("%s: the kernel is too old, aborting\n", __func__); return -2; } info = &info_linear->info; + if (!info->jited_ksyms) { + free(info_linear); + return -1; + } /* number of ksyms, func_lengths, and tags should match */ sub_prog_cnt = info->nr_jited_ksyms; if (sub_prog_cnt != info->nr_prog_tags || - sub_prog_cnt != info->nr_jited_func_lens) + sub_prog_cnt != info->nr_jited_func_lens) { + free(info_linear); return -1; + } /* check BTF func info support */ if (info->btf_id && info->nr_func_info && info->func_info_rec_size) { /* btf func info number should be same as sub_prog_cnt */ if (sub_prog_cnt != info->nr_func_info) { pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__); - err = -1; - goto out; + free(info_linear); + return -1; } if (btf__get_from_id(info->btf_id, &btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index f29af4fc3d09..8fca4779ae6a 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data) int perf_data__create_dir(struct perf_data *data, int nr) { struct perf_data_file *files = NULL; - int i, ret = -1; + int i, ret; if (WARN_ON(!data->is_dir)) return -EINVAL; @@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr) for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; - if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0) + ret = asprintf(&file->path, "%s/data.%d", data->path, i); + if (ret < 0) goto out_err; ret = 
open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 2f6cc7eea251..593f20e9774c 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -169,11 +169,13 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, struct intel_pt_insn *intel_pt_insn) { struct insn insn; + int ret; - insn_init(&insn, buf, len, x86_64); - insn_get_length(&insn); - if (!insn_complete(&insn) || insn.length > len) + ret = insn_decode(&insn, buf, len, + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + if (ret < 0 || insn.length > len) return -1; + intel_pt_insn_decoder(&insn, intel_pt_insn); if (insn.length < INTEL_PT_INSN_BUF_SZ) memcpy(intel_pt_insn->buf, buf, insn.length); @@ -194,12 +196,13 @@ const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused, u8 *inbuf, int inlen, int *lenp) { struct insn insn; - int n, i; + int n, i, ret; int left; - insn_init(&insn, inbuf, inlen, x->is64bit); - insn_get_length(&insn); - if (!insn_complete(&insn) || insn.length > inlen) + ret = insn_decode(&insn, inbuf, inlen, + x->is64bit ? 
INSN_MODE_64 : INSN_MODE_32); + + if (ret < 0 || insn.length > inlen) return "<bad>"; if (lenp) *lenp = insn.length; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index fbc40a2c17d4..8af693d9678c 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -840,15 +840,18 @@ out: int maps__clone(struct thread *thread, struct maps *parent) { struct maps *maps = thread->maps; - int err = -ENOMEM; + int err; struct map *map; down_read(&parent->lock); maps__for_each_entry(parent, map) { struct map *new = map__clone(map); - if (new == NULL) + + if (new == NULL) { + err = -ENOMEM; goto out_unlock; + } err = unwind__prepare_access(maps, new, NULL); if (err) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 42c84adeb2fb..c0c0fab22cb8 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -356,6 +356,9 @@ __add_event(struct list_head *list, int *idx, struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + if (pmu && attr->type == PERF_TYPE_RAW) + perf_pmu__warn_invalid_config(pmu, attr->config, name); + if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44ef28302fc7..46fd0f998484 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1812,3 +1812,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return nr_caps; } + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name) +{ + struct perf_pmu_format *format; + __u64 masks = 0, bits; + char buf[100]; + unsigned int i; + + list_for_each_entry(format, &pmu->format, list) { + if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) + masks |= 1ULL << i; + } + + /* + * Kernel doesn't export any valid format bits. 
+ */ + if (masks == 0) + return; + + bits = config & ~masks; + if (bits == 0) + return; + + bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); + + pr_warning("WARNING: event '%s' not valid (bits %s of config " + "'%llx' not supported by kernel)!\n", + name ?: "N/A", buf, config); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 8164388478c6..160b0f561771 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -123,4 +123,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name); + #endif /* __PMU_H */ diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b698046ec2db..dff178103ce5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -424,7 +424,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, while (!io.eof) { static const char anonstr[] = "//anon"; - size_t size; + size_t size, aligned_size; /* ensure null termination since stack will be reused. 
*/ event->mmap2.filename[0] = '\0'; @@ -484,11 +484,12 @@ out: } size = strlen(event->mmap2.filename) + 1; - size = PERF_ALIGN(size, sizeof(u64)); + aligned_size = PERF_ALIGN(size, sizeof(u64)); event->mmap2.len -= event->mmap.start; event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + (sizeof(event->mmap2.filename) - aligned_size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size + + (aligned_size - size)); event->mmap2.header.size += machine->id_hdr_size; event->mmap2.pid = tgid; event->mmap2.tid = pid; @@ -758,7 +759,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, for (i = 0; i < n; i++) { char *end; pid_t _pid; - bool kernel_thread; + bool kernel_thread = false; _pid = strtol(dirent[i]->d_name, &end, 10); if (*end) diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 3cc91ad048ea..43beb169631d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,6 +133,8 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); dso__set_long_name(dso, long_name, false); + /* Put dso here because __dsos_add already got it */ + dso__put(dso); } return dso; diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c index 9ea2c0aeb86c..185b8c588e1d 100644 --- a/tools/power/acpi/common/cmfsize.c +++ b/tools/power/acpi/common/cmfsize.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * - * Module Name: cfsize - Common get file size function + * Module Name: cmfsize - Common get file size function * * Copyright (C) 2000 - 2021, Intel Corp. 
* diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 81f4b8abbdf7..ffd50953a024 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -6819,7 +6819,7 @@ if __name__ == '__main__': sysvals.outdir = val sysvals.notestrun = True if(os.path.isdir(val) == False): - doError('%s is not accesible' % val) + doError('%s is not accessible' % val) elif(arg == '-filter'): try: val = next(args) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 582feb88eca3..ab940c508ef0 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -15,7 +15,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.8"; +static const char *version_str = "v1.9"; static const int supported_api_ver = 1; static struct isst_if_platform_info isst_platform_info; static char *progname; @@ -381,6 +381,18 @@ static void set_cpu_online_offline(int cpu, int state) close(fd); } +static void force_all_cpus_online(void) +{ + int i; + + fprintf(stderr, "Forcing all CPUs online\n"); + + for (i = 0; i < topo_max_cpus; ++i) + set_cpu_online_offline(i, 1); + + unlink("/var/run/isst_cpu_topology.dat"); +} + #define MAX_PACKAGE_COUNT 8 #define MAX_DIE_PER_PACKAGE 2 static void for_each_online_package_in_set(void (*callback)(int, void *, void *, @@ -959,6 +971,10 @@ static void isst_print_extended_platform_info(void) fprintf(outf, "Intel(R) SST-BF (feature base-freq) is not supported\n"); ret = isst_read_pm_config(i, &cp_state, &cp_cap); + if (ret) { + fprintf(outf, "Intel(R) SST-CP (feature core-power) status is unknown\n"); + return; + } if (cp_cap) fprintf(outf, "Intel(R) SST-CP (feature core-power) is supported\n"); else @@ -2763,6 +2779,7 @@ static void usage(void) printf("\t[-f|--format] : output format [json|text]. 
Default: text\n"); printf("\t[-h|--help] : Print help\n"); printf("\t[-i|--info] : Print platform information\n"); + printf("\t[-a|--all-cpus-online] : Force online every CPU in the system\n"); printf("\t[-o|--out] : Output file\n"); printf("\t\t\tDefault : stderr\n"); printf("\t[-p|--pause] : Delay between two mail box commands in milliseconds\n"); @@ -2791,7 +2808,6 @@ static void usage(void) static void print_version(void) { fprintf(outf, "Version %s\n", version_str); - fprintf(outf, "Build date %s time %s\n", __DATE__, __TIME__); exit(0); } @@ -2800,11 +2816,12 @@ static void cmdline(int argc, char **argv) const char *pathname = "/dev/isst_interface"; char *ptr; FILE *fp; - int opt; + int opt, force_cpus_online = 0; int option_index = 0; int ret; static struct option long_options[] = { + { "all-cpus-online", no_argument, 0, 'a' }, { "cpu", required_argument, 0, 'c' }, { "debug", no_argument, 0, 'd' }, { "format", required_argument, 0, 'f' }, @@ -2840,9 +2857,12 @@ static void cmdline(int argc, char **argv) } progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+c:df:hio:v", long_options, + while ((opt = getopt_long_only(argc, argv, "+c:df:hio:va", long_options, &option_index)) != -1) { switch (opt) { + case 'a': + force_cpus_online = 1; + break; case 'c': parse_cpu_command(optarg); break; @@ -2892,6 +2912,8 @@ static void cmdline(int argc, char **argv) exit(0); } set_max_cpu_num(); + if (force_cpus_online) + force_all_cpus_online(); store_cpu_topology(); set_cpu_present_cpu_mask(); set_cpu_target_cpu_mask(); diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 8e54ce47648e..3bf1820c0da1 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -25,10 +25,14 @@ static void printcpulist(int str_len, char *str, int mask_size, index = snprintf(&str[curr_index], str_len - curr_index, ","); curr_index += index; + if 
(curr_index >= str_len) + break; } index = snprintf(&str[curr_index], str_len - curr_index, "%d", i); curr_index += index; + if (curr_index >= str_len) + break; first = 0; } } @@ -64,10 +68,14 @@ static void printcpumask(int str_len, char *str, int mask_size, index = snprintf(&str[curr_index], str_len - curr_index, "%08x", mask[i]); curr_index += index; + if (curr_index >= str_len) + break; if (i) { strncat(&str[curr_index], ",", str_len - curr_index); curr_index++; } + if (curr_index >= str_len) + break; } free(mask); @@ -185,7 +193,7 @@ static void _isst_pbf_display_information(int cpu, FILE *outf, int level, int disp_level) { char header[256]; - char value[256]; + char value[512]; snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); @@ -349,7 +357,7 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { char header[256]; - char value[256]; + char value[512]; static int level; int i; diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a7c4f0772e53..5939615265f1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2449,7 +2449,7 @@ dump_knl_turbo_ratio_limits(void) fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); - /** + /* * Turbo encoding in KNL is as follows: * [0] -- Reserved * [7:1] -- Base value of number of active cores of bucket 1. diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a402f32a145c..25adfec2cb39 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -39,8 +39,6 @@ EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat -CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) 
- # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by # environment or command line. This is necessary for CC and AR @@ -52,12 +50,22 @@ define allow-override $(eval $(1) = $(2))) endef +ifneq ($(LLVM),) +$(call allow-override,CC,clang) +$(call allow-override,AR,llvm-ar) +$(call allow-override,LD,ld.lld) +$(call allow-override,CXX,clang++) +$(call allow-override,STRIP,llvm-strip) +else # Allow setting various cross-compile vars or setting CROSS_COMPILE as a prefix. $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,AR,$(CROSS_COMPILE)ar) $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +endif + +CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) ifneq ($(LLVM),) HOSTAR ?= llvm-ar @@ -86,7 +94,8 @@ endif # in newer systems. # Needed for the __raw_cmpxchg in tools/arch/x86/include/asm/cmpxchg.h # -# See https://lkml.org/lkml/2006/11/28/253 and https://gcc.gnu.org/gcc-4.8/changes.html, +# See https://lore.kernel.org/lkml/9a8748490611281710g78402fbeh8ff7fcc162dbcbca@mail.gmail.com/ +# and https://gcc.gnu.org/gcc-4.8/changes.html, # that takes into account Linus's comments (search for Wshadow) for the reasoning about # -Wshadow not being interesting before gcc 4.8. 
diff --git a/tools/spi/Makefile b/tools/spi/Makefile index ada881afb489..0aa6dbd31fb8 100644 --- a/tools/spi/Makefile +++ b/tools/spi/Makefile @@ -25,11 +25,12 @@ include $(srctree)/tools/build/Makefile.include # # We need the following to be outside of kernel tree # -$(OUTPUT)include/linux/spi/spidev.h: ../../include/uapi/linux/spi/spidev.h +$(OUTPUT)include/linux/spi: ../../include/uapi/linux/spi mkdir -p $(OUTPUT)include/linux/spi 2>&1 || true ln -sf $(CURDIR)/../../include/uapi/linux/spi/spidev.h $@ + ln -sf $(CURDIR)/../../include/uapi/linux/spi/spi.h $@ -prepare: $(OUTPUT)include/linux/spi/spidev.h +prepare: $(OUTPUT)include/linux/spi # # spidev_test diff --git a/tools/testing/kunit/configs/broken_on_uml.config b/tools/testing/kunit/configs/broken_on_uml.config index a7f0603d33f6..690870043ac0 100644 --- a/tools/testing/kunit/configs/broken_on_uml.config +++ b/tools/testing/kunit/configs/broken_on_uml.config @@ -40,3 +40,5 @@ # CONFIG_RESET_BRCMSTB_RESCAL is not set # CONFIG_RESET_INTEL_GW is not set # CONFIG_ADI_AXI_ADC is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index d5144fcb03ac..5da8fb3762f9 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -184,7 +184,9 @@ def add_common_opts(parser) -> None: help='Run all KUnit tests through allyesconfig', action='store_true') parser.add_argument('--kunitconfig', - help='Path to Kconfig fragment that enables KUnit tests', + help='Path to Kconfig fragment that enables KUnit tests.' + ' If given a directory, (e.g. 
lib/kunit), "/.kunitconfig" ' + 'will get automatically appended.', metavar='kunitconfig') def add_build_opts(parser) -> None: diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 0b550cbd667d..1e2683dcc0e7 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -13,7 +13,7 @@ from typing import List, Set CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' -KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value']) +KconfigEntryBase = collections.namedtuple('KconfigEntryBase', ['name', 'value']) class KconfigEntry(KconfigEntryBase): diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index f309a33256cd..89a7d4024e87 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -132,6 +132,8 @@ class LinuxSourceTree(object): return if kunitconfig_path: + if os.path.isdir(kunitconfig_path): + kunitconfig_path = os.path.join(kunitconfig_path, KUNITCONFIG_PATH) if not os.path.exists(kunitconfig_path): raise ConfigError(f'Specified kunitconfig ({kunitconfig_path}) does not exist') else: diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 1ad3049e9069..2e809dd956a7 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -251,6 +251,12 @@ class LinuxSourceTreeTest(unittest.TestCase): with tempfile.NamedTemporaryFile('wt') as kunitconfig: tree = kunit_kernel.LinuxSourceTree('', kunitconfig_path=kunitconfig.name) + def test_dir_kunitconfig(self): + with tempfile.TemporaryDirectory('') as dir: + with open(os.path.join(dir, '.kunitconfig'), 'w') as f: + pass + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_path=dir) + # TODO: add more test cases. 
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 3b796dd5e577..ca24f6839d50 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -296,21 +296,34 @@ static void *idr_throbber(void *arg) return NULL; } +/* + * There are always either 1 or 2 objects in the IDR. If we find nothing, + * or we find something at an ID we didn't expect, that's a bug. + */ void idr_find_test_1(int anchor_id, int throbber_id) { pthread_t throbber; time_t start = time(NULL); - pthread_create(&throbber, NULL, idr_throbber, &throbber_id); - BUG_ON(idr_alloc(&find_idr, xa_mk_value(anchor_id), anchor_id, anchor_id + 1, GFP_KERNEL) != anchor_id); + pthread_create(&throbber, NULL, idr_throbber, &throbber_id); + + rcu_read_lock(); do { int id = 0; void *entry = idr_get_next(&find_idr, &id); - BUG_ON(entry != xa_mk_value(id)); + rcu_read_unlock(); + if ((id != anchor_id && id != throbber_id) || + entry != xa_mk_value(id)) { + printf("%s(%d, %d): %p at %d\n", __func__, anchor_id, + throbber_id, entry, id); + abort(); + } + rcu_read_lock(); } while (time(NULL) < start + 11); + rcu_read_unlock(); pthread_join(throbber, NULL); @@ -577,6 +590,7 @@ void ida_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); idr_checks(); ida_tests(); @@ -584,5 +598,6 @@ int __weak main(void) rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/linux/compiler_types.h b/tools/testing/radix-tree/linux/compiler_types.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/tools/testing/radix-tree/linux/compiler_types.h +++ /dev/null diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 9eae0fb5a67d..e00520cc6349 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -224,7 +224,9 @@ void multiorder_checks(void) int 
__weak main(void) { + rcu_register_thread(); radix_tree_init(); multiorder_checks(); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index e61e43efe463..f20e12cbbfd4 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -25,11 +25,13 @@ void xarray_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); xarray_tests(); radix_tree_cpu_dead(1); rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 2c9d012797a7..ced910fb4019 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth fp mte +ARM64_SUBTARGETS ?= tags signal pauth fp mte bti else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/bti/.gitignore b/tools/testing/selftests/arm64/bti/.gitignore new file mode 100644 index 000000000000..73869fabada4 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/.gitignore @@ -0,0 +1,2 @@ +btitest +nobtitest diff --git a/tools/testing/selftests/arm64/bti/Makefile b/tools/testing/selftests/arm64/bti/Makefile new file mode 100644 index 000000000000..73e013c082a6 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/Makefile @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_PROGS := btitest nobtitest + +PROGS := $(patsubst %,gen/%,$(TEST_GEN_PROGS)) + +# These tests are built as freestanding binaries since otherwise BTI +# support in ld.so is required which is not currently widespread; when +# it is available it will still be useful to test this separately as the +# cases for statically linked and dynamically lined binaries are +# slightly different. 
+ +CFLAGS_NOBTI = -DBTI=0 +CFLAGS_BTI = -mbranch-protection=standard -DBTI=1 + +CFLAGS_COMMON = -ffreestanding -Wall -Wextra $(CFLAGS) + +BTI_CC_COMMAND = $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -c -o $@ $< +NOBTI_CC_COMMAND = $(CC) $(CFLAGS_NOBTI) $(CFLAGS_COMMON) -c -o $@ $< + +%-bti.o: %.c + $(BTI_CC_COMMAND) + +%-bti.o: %.S + $(BTI_CC_COMMAND) + +%-nobti.o: %.c + $(NOBTI_CC_COMMAND) + +%-nobti.o: %.S + $(NOBTI_CC_COMMAND) + +BTI_OBJS = \ + test-bti.o \ + signal-bti.o \ + start-bti.o \ + syscall-bti.o \ + system-bti.o \ + teststubs-bti.o \ + trampoline-bti.o +gen/btitest: $(BTI_OBJS) + $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -o $@ $^ + +NOBTI_OBJS = \ + test-nobti.o \ + signal-nobti.o \ + start-nobti.o \ + syscall-nobti.o \ + system-nobti.o \ + teststubs-nobti.o \ + trampoline-nobti.o +gen/nobtitest: $(NOBTI_OBJS) + $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -o $@ $^ + +# Including KSFT lib.mk here will also mangle the TEST_GEN_PROGS list +# to account for any OUTPUT target-dirs optionally provided by +# the toplevel makefile +include ../../lib.mk + +$(TEST_GEN_PROGS): $(PROGS) + cp $(PROGS) $(OUTPUT)/ diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h new file mode 100644 index 000000000000..04e7b72880ef --- /dev/null +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#ifndef ASSEMBLER_H +#define ASSEMBLER_H + +#define NT_GNU_PROPERTY_TYPE_0 5 +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) + + +.macro startfn name:req + .globl \name +\name: + .macro endfn + .size \name, . 
- \name + .type \name, @function + .purgem endfn + .endm +.endm + +.macro emit_aarch64_feature_1_and + .pushsection .note.gnu.property, "a" + .align 3 + .long 2f - 1f + .long 6f - 3f + .long NT_GNU_PROPERTY_TYPE_0 +1: .string "GNU" +2: + .align 3 +3: .long GNU_PROPERTY_AARCH64_FEATURE_1_AND + .long 5f - 4f +4: +#if BTI + .long GNU_PROPERTY_AARCH64_FEATURE_1_PAC | \ + GNU_PROPERTY_AARCH64_FEATURE_1_BTI +#else + .long 0 +#endif +5: + .align 3 +6: + .popsection +.endm + +.macro paciasp + hint 0x19 +.endm + +.macro autiasp + hint 0x1d +.endm + +.macro __bti_ + hint 0x20 +.endm + +.macro __bti_c + hint 0x22 +.endm + +.macro __bti_j + hint 0x24 +.endm + +.macro __bti_jc + hint 0x26 +.endm + +.macro bti what= + __bti_\what +.endm + +#endif /* ! ASSEMBLER_H */ diff --git a/tools/testing/selftests/arm64/bti/btitest.h b/tools/testing/selftests/arm64/bti/btitest.h new file mode 100644 index 000000000000..2aff9b10336e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/btitest.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#ifndef BTITEST_H +#define BTITEST_H + +/* Trampolines for calling the test stubs: */ +void call_using_br_x0(void (*)(void)); +void call_using_br_x16(void (*)(void)); +void call_using_blr(void (*)(void)); + +/* Test stubs: */ +void nohint_func(void); +void bti_none_func(void); +void bti_c_func(void); +void bti_j_func(void); +void bti_jc_func(void); +void paciasp_func(void); + +#endif /* !BTITEST_H */ diff --git a/tools/testing/selftests/arm64/bti/compiler.h b/tools/testing/selftests/arm64/bti/compiler.h new file mode 100644 index 000000000000..ebb6204f447a --- /dev/null +++ b/tools/testing/selftests/arm64/bti/compiler.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#ifndef COMPILER_H +#define COMPILER_H + +#define __always_unused 
__attribute__((__unused__)) +#define __noreturn __attribute__((__noreturn__)) +#define __unreachable() __builtin_unreachable() + +/* curse(e) has value e, but the compiler cannot assume so */ +#define curse(e) ({ \ + __typeof__(e) __curse_e = (e); \ + asm ("" : "+r" (__curse_e)); \ + __curse_e; \ +}) + +#endif /* ! COMPILER_H */ diff --git a/tools/testing/selftests/arm64/bti/gen/.gitignore b/tools/testing/selftests/arm64/bti/gen/.gitignore new file mode 100644 index 000000000000..73869fabada4 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/gen/.gitignore @@ -0,0 +1,2 @@ +btitest +nobtitest diff --git a/tools/testing/selftests/arm64/bti/signal.c b/tools/testing/selftests/arm64/bti/signal.c new file mode 100644 index 000000000000..f3fd29b91141 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/signal.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "system.h" +#include "signal.h" + +int sigemptyset(sigset_t *s) +{ + unsigned int i; + + for (i = 0; i < _NSIG_WORDS; ++i) + s->sig[i] = 0; + + return 0; +} + +int sigaddset(sigset_t *s, int n) +{ + if (n < 1 || n > _NSIG) + return -EINVAL; + + s->sig[(n - 1) / _NSIG_BPW] |= 1UL << (n - 1) % _NSIG_BPW; + return 0; +} + +int sigaction(int n, struct sigaction *sa, const struct sigaction *old) +{ + return syscall(__NR_rt_sigaction, n, sa, old, sizeof(sa->sa_mask)); +} + +int sigprocmask(int how, const sigset_t *mask, sigset_t *old) +{ + return syscall(__NR_rt_sigprocmask, how, mask, old, sizeof(*mask)); +} diff --git a/tools/testing/selftests/arm64/bti/signal.h b/tools/testing/selftests/arm64/bti/signal.h new file mode 100644 index 000000000000..103457dc880e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/signal.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#ifndef SIGNAL_H +#define 
SIGNAL_H + +#include <linux/signal.h> + +#include "system.h" + +typedef __sighandler_t sighandler_t; + +int sigemptyset(sigset_t *s); +int sigaddset(sigset_t *s, int n); +int sigaction(int n, struct sigaction *sa, const struct sigaction *old); +int sigprocmask(int how, const sigset_t *mask, sigset_t *old); + +#endif /* ! SIGNAL_H */ diff --git a/tools/testing/selftests/arm64/bti/start.S b/tools/testing/selftests/arm64/bti/start.S new file mode 100644 index 000000000000..831f952e0572 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/start.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "assembler.h" + +startfn _start + mov x0, sp + b start +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/syscall.S b/tools/testing/selftests/arm64/bti/syscall.S new file mode 100644 index 000000000000..8dde8b6f3db1 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/syscall.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "assembler.h" + +startfn syscall + bti c + mov w8, w0 + mov x0, x1 + mov x1, x2 + mov x2, x3 + mov x3, x4 + mov x4, x5 + mov x5, x6 + mov x6, x7 + svc #0 + ret +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/system.c b/tools/testing/selftests/arm64/bti/system.c new file mode 100644 index 000000000000..6385d8d4973b --- /dev/null +++ b/tools/testing/selftests/arm64/bti/system.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "system.h" + +#include <asm/unistd.h> + +#include "compiler.h" + +void __noreturn exit(int n) +{ + syscall(__NR_exit, n); + __unreachable(); +} + +ssize_t write(int fd, const void *buf, size_t size) +{ + return 
syscall(__NR_write, fd, buf, size); +} diff --git a/tools/testing/selftests/arm64/bti/system.h b/tools/testing/selftests/arm64/bti/system.h new file mode 100644 index 000000000000..aca118589705 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/system.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#ifndef SYSTEM_H +#define SYSTEM_H + +#include <linux/types.h> +#include <linux/stddef.h> + +typedef __kernel_size_t size_t; +typedef __kernel_ssize_t ssize_t; + +#include <linux/errno.h> +#include <asm/hwcap.h> +#include <asm/ptrace.h> +#include <asm/unistd.h> + +#include "compiler.h" + +long syscall(int nr, ...); + +void __noreturn exit(int n); +ssize_t write(int fd, const void *buf, size_t size); + +#endif /* ! SYSTEM_H */ diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c new file mode 100644 index 000000000000..656b04976ccc --- /dev/null +++ b/tools/testing/selftests/arm64/bti/test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019,2021 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "system.h" + +#include <linux/errno.h> +#include <linux/auxvec.h> +#include <linux/signal.h> +#include <asm/sigcontext.h> +#include <asm/ucontext.h> + +typedef struct ucontext ucontext_t; + +#include "btitest.h" +#include "compiler.h" +#include "signal.h" + +#define EXPECTED_TESTS 18 + +static volatile unsigned int test_num = 1; +static unsigned int test_passed; +static unsigned int test_failed; +static unsigned int test_skipped; + +static void fdputs(int fd, const char *str) +{ + size_t len = 0; + const char *p = str; + + while (*p++) + ++len; + + write(fd, str, len); +} + +static void putstr(const char *str) +{ + fdputs(1, str); +} + +static void putnum(unsigned int num) +{ + char c; + + if (num / 10) + putnum(num / 10); + + c = '0' + (num % 10); + write(1, 
&c, 1); +} + +#define puttestname(test_name, trampoline_name) do { \ + putstr(test_name); \ + putstr("/"); \ + putstr(trampoline_name); \ +} while (0) + +void print_summary(void) +{ + putstr("# Totals: pass:"); + putnum(test_passed); + putstr(" fail:"); + putnum(test_failed); + putstr(" xfail:0 xpass:0 skip:"); + putnum(test_skipped); + putstr(" error:0\n"); +} + +static const char *volatile current_test_name; +static const char *volatile current_trampoline_name; +static volatile int sigill_expected, sigill_received; + +static void handler(int n, siginfo_t *si __always_unused, + void *uc_ __always_unused) +{ + ucontext_t *uc = uc_; + + putstr("# \t[SIGILL in "); + puttestname(current_test_name, current_trampoline_name); + putstr(", BTYPE="); + write(1, &"00011011"[((uc->uc_mcontext.pstate & PSR_BTYPE_MASK) + >> PSR_BTYPE_SHIFT) * 2], 2); + if (!sigill_expected) { + putstr("]\n"); + putstr("not ok "); + putnum(test_num); + putstr(" "); + puttestname(current_test_name, current_trampoline_name); + putstr("(unexpected SIGILL)\n"); + print_summary(); + exit(128 + n); + } + + putstr(" (expected)]\n"); + sigill_received = 1; + /* zap BTYPE so that resuming the faulting code will work */ + uc->uc_mcontext.pstate &= ~PSR_BTYPE_MASK; +} + +static int skip_all; + +static void __do_test(void (*trampoline)(void (*)(void)), + void (*fn)(void), + const char *trampoline_name, + const char *name, + int expect_sigill) +{ + if (skip_all) { + test_skipped++; + putstr("ok "); + putnum(test_num); + putstr(" "); + puttestname(name, trampoline_name); + putstr(" # SKIP\n"); + + return; + } + + /* Branch Target exceptions should only happen in BTI binaries: */ + if (!BTI) + expect_sigill = 0; + + sigill_expected = expect_sigill; + sigill_received = 0; + current_test_name = name; + current_trampoline_name = trampoline_name; + + trampoline(fn); + + if (expect_sigill && !sigill_received) { + putstr("not ok "); + test_failed++; + } else { + putstr("ok "); + test_passed++; + } + 
putnum(test_num++); + putstr(" "); + puttestname(name, trampoline_name); + putstr("\n"); +} + +#define do_test(expect_sigill_br_x0, \ + expect_sigill_br_x16, \ + expect_sigill_blr, \ + name) \ +do { \ + __do_test(call_using_br_x0, name, "call_using_br_x0", #name, \ + expect_sigill_br_x0); \ + __do_test(call_using_br_x16, name, "call_using_br_x16", #name, \ + expect_sigill_br_x16); \ + __do_test(call_using_blr, name, "call_using_blr", #name, \ + expect_sigill_blr); \ +} while (0) + +void start(int *argcp) +{ + struct sigaction sa; + void *const *p; + const struct auxv_entry { + unsigned long type; + unsigned long val; + } *auxv; + unsigned long hwcap = 0, hwcap2 = 0; + + putstr("TAP version 13\n"); + putstr("1.."); + putnum(EXPECTED_TESTS); + putstr("\n"); + + /* Gross hack for finding AT_HWCAP2 from the initial process stack: */ + p = (void *const *)argcp + 1 + *argcp + 1; /* start of environment */ + /* step over environment */ + while (*p++) + ; + for (auxv = (const struct auxv_entry *)p; auxv->type != AT_NULL; ++auxv) { + switch (auxv->type) { + case AT_HWCAP: + hwcap = auxv->val; + break; + case AT_HWCAP2: + hwcap2 = auxv->val; + break; + default: + break; + } + } + + if (hwcap & HWCAP_PACA) + putstr("# HWCAP_PACA present\n"); + else + putstr("# HWCAP_PACA not present\n"); + + if (hwcap2 & HWCAP2_BTI) { + putstr("# HWCAP2_BTI present\n"); + if (!(hwcap & HWCAP_PACA)) + putstr("# Bad hardware? 
Expect problems.\n"); + } else { + putstr("# HWCAP2_BTI not present\n"); + skip_all = 1; + } + + putstr("# Test binary"); + if (!BTI) + putstr(" not"); + putstr(" built for BTI\n"); + + sa.sa_handler = (sighandler_t)(void *)handler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sigaction(SIGILL, &sa, NULL); + sigaddset(&sa.sa_mask, SIGILL); + sigprocmask(SIG_UNBLOCK, &sa.sa_mask, NULL); + + do_test(1, 1, 1, nohint_func); + do_test(1, 1, 1, bti_none_func); + do_test(1, 0, 0, bti_c_func); + do_test(0, 0, 1, bti_j_func); + do_test(0, 0, 0, bti_jc_func); + do_test(1, 0, 0, paciasp_func); + + print_summary(); + + if (test_num - 1 != EXPECTED_TESTS) + putstr("# WARNING - EXPECTED TEST COUNT WRONG\n"); + + if (test_failed) + exit(1); + else + exit(0); +} diff --git a/tools/testing/selftests/arm64/bti/teststubs.S b/tools/testing/selftests/arm64/bti/teststubs.S new file mode 100644 index 000000000000..b62c8c35f67e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/teststubs.S @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "assembler.h" + +startfn bti_none_func + bti + ret +endfn + +startfn bti_c_func + bti c + ret +endfn + +startfn bti_j_func + bti j + ret +endfn + +startfn bti_jc_func + bti jc + ret +endfn + +startfn paciasp_func + paciasp + autiasp + ret +endfn + +startfn nohint_func + ret +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/trampoline.S b/tools/testing/selftests/arm64/bti/trampoline.S new file mode 100644 index 000000000000..09beb3f361f1 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/trampoline.S @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin <Dave.Martin@arm.com> + */ + +#include "assembler.h" + +startfn call_using_br_x0 + bti c + br x0 +endfn + +startfn call_using_br_x16 + bti c + mov x16, x0 + br x16 
+endfn + +startfn call_using_blr + paciasp + stp x29, x30, [sp, #-16]! + blr x0 + ldp x29, x30, [sp], #16 + autiasp + ret +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index 9210691aa998..e3e08d9c7020 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -284,16 +284,28 @@ endfunction // Set up test pattern in the FFR // x0: pid // x2: generation +// +// We need to generate a canonical FFR value, which consists of a number of +// low "1" bits, followed by a number of zeros. This gives us 17 unique values +// per 16 bits of FFR, so we create a 4 bit signature out of the PID and +// generation, and use that as the initial number of ones in the pattern. +// We fill the upper lanes of FFR with zeros. // Beware: corrupts P0. function setup_ffr mov x4, x30 - bl pattern + and w0, w0, #0x3 + bfi w0, w2, #2, #2 + mov w1, #1 + lsl w1, w1, w0 + sub w1, w1, #1 + ldr x0, =ffrref - ldr x1, =scratch - rdvl x2, #1 - lsr x2, x2, #3 - bl memcpy + strh w1, [x0], 2 + rdvl x1, #1 + lsr x1, x1, #3 + sub x1, x1, #2 + bl memclr mov x0, #0 ldr x1, =ffrref diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index 0b3af552632a..409e3e53d00a 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -1,14 +1,18 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2020 ARM Limited -CFLAGS += -std=gnu99 -I. -lpthread +# preserve CC value from top level Makefile +ifeq ($(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + +CFLAGS += -std=gnu99 -I. 
-pthread +LDFLAGS += -pthread SRCS := $(filter-out mte_common_util.c,$(wildcard *.c)) PROGS := $(patsubst %.c,%,$(SRCS)) #Add mte compiler option -ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep gcc),) CFLAGS += -march=armv8.5-a+memtag -endif #check if the compiler works well mte_cc_support := $(shell if ($(CC) $(CFLAGS) -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) @@ -19,11 +23,14 @@ TEST_GEN_PROGS := $(PROGS) # Get Kernel headers installed and use them. KSFT_KHDR_INSTALL := 1 +else + $(warning compiler "$(CC)" does not support the ARMv8.5 MTE extension.) + $(warning test program "mte" will not be created.) endif # Include KSFT lib.mk. include ../../lib.mk ifeq ($(mte_cc_support),1) -$(TEST_GEN_PROGS): mte_common_util.c mte_common_util.h mte_helper.S +$(TEST_GEN_PROGS): mte_common_util.c mte_helper.S endif diff --git a/tools/testing/selftests/arm64/mte/check_ksm_options.c b/tools/testing/selftests/arm64/mte/check_ksm_options.c index 3b23c4d61d38..88c74bc46d4f 100644 --- a/tools/testing/selftests/arm64/mte/check_ksm_options.c +++ b/tools/testing/selftests/arm64/mte/check_ksm_options.c @@ -33,7 +33,10 @@ static unsigned long read_sysfs(char *str) ksft_print_msg("ERR: missing %s\n", str); return 0; } - fscanf(f, "%lu", &val); + if (fscanf(f, "%lu", &val) != 1) { + ksft_print_msg("ERR: parsing %s\n", str); + val = 0; + } fclose(f); return val; } diff --git a/tools/testing/selftests/arm64/mte/check_user_mem.c b/tools/testing/selftests/arm64/mte/check_user_mem.c index 4bfa80f2a8c3..1de7a0abd0ae 100644 --- a/tools/testing/selftests/arm64/mte/check_user_mem.c +++ b/tools/testing/selftests/arm64/mte/check_user_mem.c @@ -33,7 +33,8 @@ static int check_usermem_access_fault(int mem_type, int mode, int mapping) if (fd == -1) return KSFT_FAIL; for (i = 0; i < len; i++) - write(fd, &val, sizeof(val)); + if (write(fd, &val, sizeof(val)) != sizeof(val)) + return KSFT_FAIL; lseek(fd, 0, 0); ptr = mte_allocate_memory(len, mem_type, mapping, true); if 
(check_allocated_memory(ptr, len, mem_type, true) != KSFT_PASS) { diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 39f8908988ea..f50ac31920d1 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -181,10 +181,17 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags } /* Initialize the file for mappable size */ lseek(fd, 0, SEEK_SET); - for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) - write(fd, buffer, INIT_BUFFER_SIZE); + for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) { + if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { + perror("initialising buffer"); + return NULL; + } + } index -= INIT_BUFFER_SIZE; - write(fd, buffer, size - index); + if (write(fd, buffer, size - index) != size - index) { + perror("initialising buffer"); + return NULL; + } return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd); } @@ -202,9 +209,15 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, /* Initialize the file for mappable size */ lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) - write(fd, buffer, INIT_BUFFER_SIZE); + if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { + perror("initialising buffer"); + return NULL; + } index -= INIT_BUFFER_SIZE; - write(fd, buffer, map_size - index); + if (write(fd, buffer, map_size - index) != map_size - index) { + perror("initialising buffer"); + return NULL; + } return __mte_allocate_memory_range(size, mem_type, mapping, range_before, range_after, true, fd); } @@ -271,29 +284,20 @@ int mte_switch_mode(int mte_option, unsigned long incl_mask) en |= (incl_mask << PR_MTE_TAG_SHIFT); /* Enable address tagging ABI, mte error reporting mode and tag inclusion mask. 
*/ - if (!prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) == 0) { + if (prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) != 0) { ksft_print_msg("FAIL:prctl PR_SET_TAGGED_ADDR_CTRL for mte mode\n"); return -EINVAL; } return 0; } -#define ID_AA64PFR1_MTE_SHIFT 8 -#define ID_AA64PFR1_MTE 2 - int mte_default_setup(void) { - unsigned long hwcaps = getauxval(AT_HWCAP); + unsigned long hwcaps2 = getauxval(AT_HWCAP2); unsigned long en = 0; int ret; - if (!(hwcaps & HWCAP_CPUID)) { - ksft_print_msg("FAIL: CPUID registers unavailable\n"); - return KSFT_FAIL; - } - /* Read ID_AA64PFR1_EL1 register */ - asm volatile("mrs %0, id_aa64pfr1_el1" : "=r"(hwcaps) : : "memory"); - if (((hwcaps >> ID_AA64PFR1_MTE_SHIFT) & MT_TAG_MASK) != ID_AA64PFR1_MTE) { + if (!(hwcaps2 & HWCAP2_MTE)) { ksft_print_msg("FAIL: MTE features unavailable\n"); return KSFT_SKIP; } @@ -333,6 +337,7 @@ int create_temp_file(void) /* Create a file in the tmpfs filesystem */ fd = mkstemp(&filename[0]); if (fd == -1) { + perror(filename); ksft_print_msg("FAIL: Unable to open temporary file\n"); return 0; } diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c0c48fdb9ac1..4866f6a21901 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +bpf-helpers* +bpf-syscall* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 044bfdcf5b74..511259c2c6c5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -include ../../../../scripts/Kbuild.include +include ../../../build/Build.include include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include @@ -21,13 +21,18 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -O0 
-rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup \ @@ -68,6 +73,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_doc_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ @@ -103,6 +109,7 @@ override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) $(Q)$(MAKE) -C bpf_testmod clean + $(Q)$(MAKE) docs-clean endef include ../lib.mk @@ -198,18 +205,27 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install - $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation - $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ - -C $(BPFTOOLDIR)/Documentation \ - OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ - prefix= DESTDIR=$(SCRATCH_DIR)/ install + +all: docs + +docs: + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ + +docs-clean: + $(Q)$(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) 
OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -217,11 +233,12 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -O0' \ + OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ @@ -292,6 +309,16 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ + linked_vars.skel.h linked_maps.skel.h + +test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o +linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o +linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o +linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o + +LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. 
# Parameters: @@ -310,8 +337,9 @@ TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ - $$(filter-out $(SKEL_BLACKLIST), \ + $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) +TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -344,11 +372,22 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) -$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ - $(TRUNNER_OUTPUT)/%.o \ - | $(BPFTOOL) $(TRUNNER_OUTPUT) +$(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen skeleton $$< > $$@ + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) + $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ + +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) + $(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) + $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ endif # ensure we set up tests.h header generation 
rule just once @@ -370,6 +409,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) @@ -382,11 +422,12 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ -# only copy extra resources if in flavored build +# non-flavored in-srctree builds receive special treatment, in particular, we +# do not need to copy extra resources (see e.g. test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) -ifneq ($2,) +ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ @@ -452,7 +493,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Make sure we are able to include and link libbpf against c++. 
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h @@ -476,3 +517,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) + +.PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs new file mode 100644 index 000000000000..ccf260021e83 --- /dev/null +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak + +INSTALL ?= install +RM ?= rm -f +RMDIR ?= rmdir --ignore-fail-on-non-empty + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +prefix ?= /usr/local +mandir ?= $(prefix)/man +man2dir = $(mandir)/man2 +man7dir = $(mandir)/man7 + +SYSCALL_RST = bpf-syscall.rst +MAN2_RST = $(SYSCALL_RST) + +HELPERS_RST = bpf-helpers.rst +MAN7_RST = $(HELPERS_RST) + +_DOC_MAN2 = $(patsubst %.rst,%.2,$(MAN2_RST)) +DOC_MAN2 = $(addprefix $(OUTPUT),$(_DOC_MAN2)) + +_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) +DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) + +DOCTARGETS := helpers syscall + +docs: $(DOCTARGETS) +syscall: man2 +helpers: man7 +man2: $(DOC_MAN2) +man7: $(DOC_MAN7) + +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) + +# Configure make rules for the man page bpf-$1.$2. 
+# $1 - target for scripts/bpf_doc.py +# $2 - man page section to generate the troff file +define DOCS_RULES = +$(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h + $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ + --filename $$< > $$@ + +$(OUTPUT)%.$2: $(OUTPUT)%.rst +ifndef RST2MAN_DEP + $$(error "rst2man not found, but required to generate man pages") +endif + $$(QUIET_GEN)rst2man $$< > $$@ + +docs-clean-$1: + $$(call QUIET_CLEAN, eBPF_$1-manpage) + $(Q)$(RM) $$(DOC_MAN$2) $(OUTPUT)bpf-$1.rst + +docs-install-$1: docs + $$(call QUIET_INSTALL, eBPF_$1-manpage) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$$(man$2dir) + $(Q)$(INSTALL) -m 644 $$(DOC_MAN$2) $(DESTDIR)$$(man$2dir) + +docs-uninstall-$1: + $$(call QUIET_UNINST, eBPF_$1-manpage) + $(Q)$(RM) $$(addprefix $(DESTDIR)$$(man$2dir)/,$$(_DOC_MAN$2)) + $(Q)$(RMDIR) $(DESTDIR)$$(man$2dir) + +.PHONY: $1 docs-clean-$1 docs-install-$1 docs-uninstall-$1 +endef + +# Create the make targets to generate manual pages by name and section +$(eval $(call DOCS_RULES,helpers,7)) +$(eval $(call DOCS_RULES,syscall,2)) + +docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) +docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) +docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) + +.PHONY: docs docs-clean docs-install docs-uninstall man2 man7 diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index fd148b8410fa..3353778c30f8 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -111,6 +111,45 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk. __ https://reviews.llvm.org/D78466 +bpf_verif_scale/loop6.o test failure with Clang 12 +================================================== + +With Clang 12, the following bpf_verif_scale test failed: + * ``bpf_verif_scale/loop6.o`` + +The verifier output looks like + +.. 
code-block:: c + + R1 type=ctx expected=fp + The sequence of 8193 jumps is too complex. + +The reason is compiler generating the following code + +.. code-block:: c + + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 14: 16 05 40 00 00 00 00 00 if w5 == 0 goto +64 <LBB0_6> + 15: bc 51 00 00 00 00 00 00 w1 = w5 + 16: 04 01 00 00 ff ff ff ff w1 += -1 + 17: 67 05 00 00 20 00 00 00 r5 <<= 32 + 18: 77 05 00 00 20 00 00 00 r5 >>= 32 + 19: a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 <LBB0_4> + 20: b7 05 00 00 06 00 00 00 r5 = 6 + 00000000000000a8 <LBB0_4>: + 21: b7 02 00 00 00 00 00 00 r2 = 0 + 22: b7 01 00 00 00 00 00 00 r1 = 0 + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 23: 7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1 + 24: 7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5 + +Note that insn #15 has w1 = w5 and w1 is refined later but +r5(w5) is eventually saved on stack at insn #24 for later use. +This cause later verifier failure. The bug has been `fixed`__ in +Clang 13. + +__ https://reviews.llvm.org/D97479 + BPF CO-RE-based tests and Clang version ======================================= @@ -131,3 +170,35 @@ failures: .. _2: https://reviews.llvm.org/D85174 .. _3: https://reviews.llvm.org/D83878 .. _4: https://reviews.llvm.org/D83242 + +Floating-point tests and Clang version +====================================== + +Certain selftests, e.g. core_reloc, require support for the floating-point +types, which was introduced in `Clang 13`__. The older Clang versions will +either crash when compiling these tests, or generate an incorrect BTF. + +__ https://reviews.llvm.org/D83289 + +Kernel function call test and Clang version +=========================================== + +Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support +to generate extern function in BTF. It was introduced in `Clang 13`__. + +Without it, the error from compiling bpf selftests looks like: + +.. 
code-block:: console + + libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 + +__ https://reviews.llvm.org/D93563 + +Clang dependencies for static linking tests +=========================================== + +linked_vars, linked_maps, and linked_funcs tests depend on `Clang fix`__ to +generate valid BTF information for weak variables. Please make sure you use +Clang that contains the fix. + +__ https://reviews.llvm.org/D100362 diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 91f0fac632f4..029589c008c9 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -187,16 +187,6 @@ struct tcp_congestion_ops { typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) -static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) -{ - __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - - return acked; -} - static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tp->snd_cwnd < tp->snd_ssthresh; @@ -213,22 +203,7 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); } -static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) -{ - /* If credits accumulated at a higher w, apply them gently now. 
*/ - if (tp->snd_cwnd_cnt >= w) { - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd++; - } - - tp->snd_cwnd_cnt += acked; - if (tp->snd_cwnd_cnt >= w) { - __u32 delta = tp->snd_cwnd_cnt / w; - - tp->snd_cwnd_cnt -= delta * w; - tp->snd_cwnd += delta; - } - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); -} +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #endif diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index 48f90490f922..b692e6ead9b5 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -23,6 +23,7 @@ static const char * const btf_kind_str_mapping[] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_kind_str(__u16 kind) @@ -173,6 +174,9 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) } break; } + case BTF_KIND_FLOAT: + fprintf(out, " size=%u", t->size); + break; default: break; } diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 37e1f303fc11..5192305159ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -44,3 +44,5 @@ CONFIG_SECURITYFS=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y CONFIG_BLK_DEV_LOOP=y +CONFIG_FUNCTION_TRACER=y +CONFIG_DYNAMIC_FTRACE=y diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index b8d6aef99db4..99628e1a1e58 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -57,6 +57,10 @@ int main(int argc, char **argv) __u32 key = 0, pid; int exit_code = 1; char buf[256]; + const struct timespec req = { + .tv_sec = 1, + .tv_nsec = 0, + }; cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); if (CHECK(cgroup_fd 
< 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) @@ -115,7 +119,7 @@ int main(int argc, char **argv) goto close_pmu; /* trigger some syscalls */ - sleep(1); + syscall(__NR_nanosleep, &req, NULL); err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index f0a64d8ac59a..f4d870da7684 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -9,10 +9,13 @@ #include <test_maps.h> +static int nr_cpus; + static void map_batch_update(int map_fd, __u32 max_entries, int *keys, - int *values) + __s64 *values, bool is_pcpu) { - int i, err; + int i, j, err; + int cpu_offset = 0; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, @@ -20,22 +23,41 @@ static void map_batch_update(int map_fd, __u32 max_entries, int *keys, for (i = 0; i < max_entries; i++) { keys[i] = i; - values[i] = i + 1; + if (is_pcpu) { + cpu_offset = i * nr_cpus; + for (j = 0; j < nr_cpus; j++) + (values + cpu_offset)[j] = i + 1 + j; + } else { + values[i] = i + 1; + } } err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); } -static void map_batch_verify(int *visited, __u32 max_entries, - int *keys, int *values) +static void map_batch_verify(int *visited, __u32 max_entries, int *keys, + __s64 *values, bool is_pcpu) { - int i; + int i, j; + int cpu_offset = 0; memset(visited, 0, max_entries * sizeof(*visited)); for (i = 0; i < max_entries; i++) { - CHECK(keys[i] + 1 != values[i], "key/value checking", - "error: i %d key %d value %d\n", i, keys[i], values[i]); + if (is_pcpu) { + cpu_offset = i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + __s64 value = (values + cpu_offset)[j]; + CHECK(keys[i] + 
j + 1 != value, + "key/value checking", + "error: i %d j %d key %d value %lld\n", i, + j, keys[i], value); + } + } else { + CHECK(keys[i] + 1 != values[i], "key/value checking", + "error: i %d key %d value %lld\n", i, keys[i], + values[i]); + } visited[i] = 1; } for (i = 0; i < max_entries; i++) { @@ -44,20 +66,21 @@ static void map_batch_verify(int *visited, __u32 max_entries, } } -void test_array_map_batch_ops(void) +static void __test_map_lookup_and_update_batch(bool is_pcpu) { struct bpf_create_map_attr xattr = { .name = "array_map", - .map_type = BPF_MAP_TYPE_ARRAY, + .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY : + BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), - .value_size = sizeof(int), + .value_size = sizeof(__s64), }; - int map_fd, *keys, *values, *visited; + int map_fd, *keys, *visited; __u32 count, total, total_success; const __u32 max_entries = 10; - bool nospace_err; __u64 batch = 0; - int err, step; + int err, step, value_size; + void *values; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, @@ -68,35 +91,35 @@ void test_array_map_batch_ops(void) CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); - keys = malloc(max_entries * sizeof(int)); - values = malloc(max_entries * sizeof(int)); - visited = malloc(max_entries * sizeof(int)); + value_size = sizeof(__s64); + if (is_pcpu) + value_size *= nr_cpus; + + keys = calloc(max_entries, sizeof(*keys)); + values = calloc(max_entries, value_size); + visited = calloc(max_entries, sizeof(*visited)); CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno)); - /* populate elements to the map */ - map_batch_update(map_fd, max_entries, keys, values); - /* test 1: lookup in a loop with various steps. 
*/ total_success = 0; for (step = 1; step < max_entries; step++) { - map_batch_update(map_fd, max_entries, keys, values); - map_batch_verify(visited, max_entries, keys, values); + map_batch_update(map_fd, max_entries, keys, values, is_pcpu); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); memset(keys, 0, max_entries * sizeof(*keys)); - memset(values, 0, max_entries * sizeof(*values)); + memset(values, 0, max_entries * value_size); batch = 0; total = 0; /* iteratively lookup/delete elements with 'step' * elements each. */ count = step; - nospace_err = false; while (true) { err = bpf_map_lookup_batch(map_fd, - total ? &batch : NULL, &batch, - keys + total, - values + total, - &count, &opts); + total ? &batch : NULL, + &batch, keys + total, + values + total * value_size, + &count, &opts); CHECK((err && errno != ENOENT), "lookup with steps", "error: %s\n", strerror(errno)); @@ -107,13 +130,10 @@ void test_array_map_batch_ops(void) } - if (nospace_err == true) - continue; - CHECK(total != max_entries, "lookup with steps", "total = %u, max_entries = %u\n", total, max_entries); - map_batch_verify(visited, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); total_success++; } @@ -121,9 +141,30 @@ void test_array_map_batch_ops(void) CHECK(total_success == 0, "check total_success", "unexpected failure\n"); - printf("%s:PASS\n", __func__); - free(keys); free(values); free(visited); } + +static void array_map_batch_ops(void) +{ + __test_map_lookup_and_update_batch(false); + printf("test_%s:PASS\n", __func__); +} + +static void array_percpu_map_batch_ops(void) +{ + __test_map_lookup_and_update_batch(true); + printf("test_%s:PASS\n", __func__); +} + +void test_array_map_batch_ops(void) +{ + nr_cpus = libbpf_num_possible_cpus(); + + CHECK(nr_cpus < 0, "nr_cpus checking", + "error: get possible cpus failed"); + + array_map_batch_ops(); + array_percpu_map_batch_ops(); +} diff --git 
a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c new file mode 100644 index 000000000000..2e986e5e4cac --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <arpa/inet.h> +#include <linux/bpf.h> +#include <netinet/in.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include <test_maps.h> + +struct test_lpm_key { + __u32 prefix; + struct in_addr ipv4; +}; + +static void map_batch_update(int map_fd, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + __u32 i; + int err; + char buff[16] = { 0 }; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + for (i = 0; i < max_entries; i++) { + keys[i].prefix = 32; + snprintf(buff, 16, "192.168.1.%d", i + 1); + inet_pton(AF_INET, buff, &keys[i].ipv4); + values[i] = i + 1; + } + + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); +} + +static void map_batch_verify(int *visited, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + char buff[16] = { 0 }; + int lower_byte = 0; + __u32 i; + + memset(visited, 0, max_entries * sizeof(*visited)); + for (i = 0; i < max_entries; i++) { + inet_ntop(AF_INET, &keys[i].ipv4, buff, 32); + CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF, + "sscanf()", "error: i %d\n", i); + CHECK(lower_byte != values[i], "key/value checking", + "error: i %d key %s value %d\n", i, buff, values[i]); + visited[i] = 1; + } + for (i = 0; i < max_entries; i++) { + CHECK(visited[i] != 1, "visited checking", + "error: keys array at index %d missing\n", i); + } +} + +void test_lpm_trie_map_batch_ops(void) +{ + struct bpf_create_map_attr xattr = { + .name = "lpm_trie_map", + .map_type = 
BPF_MAP_TYPE_LPM_TRIE, + .key_size = sizeof(struct test_lpm_key), + .value_size = sizeof(int), + .map_flags = BPF_F_NO_PREALLOC, + }; + struct test_lpm_key *keys, key; + int map_fd, *values, *visited; + __u32 step, count, total, total_success; + const __u32 max_entries = 10; + __u64 batch = 0; + int err; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + xattr.max_entries = max_entries; + map_fd = bpf_create_map_xattr(&xattr); + CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", + strerror(errno)); + + keys = malloc(max_entries * sizeof(struct test_lpm_key)); + values = malloc(max_entries * sizeof(int)); + visited = malloc(max_entries * sizeof(int)); + CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", + strerror(errno)); + + total_success = 0; + for (step = 1; step < max_entries; step++) { + map_batch_update(map_fd, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * sizeof(*values)); + batch = 0; + total = 0; + /* iteratively lookup/delete elements with 'step' + * elements each. + */ + count = step; + while (true) { + err = bpf_map_lookup_batch(map_fd, + total ? 
&batch : NULL, &batch, + keys + total, values + total, &count, &opts); + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + } + + CHECK(total != max_entries, "lookup with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + map_batch_verify(visited, max_entries, keys, values); + + total = 0; + count = step; + while (total < max_entries) { + if (max_entries - total < step) + count = max_entries - total; + err = bpf_map_delete_batch(map_fd, keys + total, &count, + &opts); + CHECK((err && errno != ENOENT), "delete batch", + "error: %s\n", strerror(errno)); + total += count; + if (err) + break; + } + CHECK(total != max_entries, "delete with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + /* check map is empty, errono == ENOENT */ + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", + "error: %s\n", strerror(errno)); + + total_success++; + } + + CHECK(total_success == 0, "check total_success", + "unexpected failure\n"); + + printf("%s:PASS\n", __func__); + + free(keys); + free(values); + free(visited); +} diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a0ee87c8e1ea..9dc4e3dfbcf3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,6 +2,44 @@ #include <test_progs.h> #include "test_attach_probe.skel.h" +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +static ssize_t get_offset(ssize_t addr, ssize_t base) +{ + u32 *insn = (u32 *) addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. 
We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence. + * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return addr - base; +} +#else +#define get_offset(addr, base) (addr - base) +#endif + ssize_t get_base_addr() { size_t start, offset; char buf[256]; @@ -36,7 +74,7 @@ void test_attach_probe(void) if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = (size_t)&get_base_addr - base_addr; + uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 74c45d557a2b..2d3590cfb5e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -147,6 +147,7 @@ static void test_task_stack(void) return; do_dummy_read(skel->progs.dump_task_stack); + do_dummy_read(skel->progs.get_task_user_stacks); bpf_iter_task_stack__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 37c5494a0381..e25917f04602 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -6,6 +6,7 @@ #include <test_progs.h> #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" +#include "bpf_tcp_nogpl.skel.h" #define min(a, b) ((a) < (b) ? 
(a) : (b)) @@ -227,10 +228,53 @@ static void test_dctcp(void) bpf_dctcp__destroy(dctcp_skel); } +static char *err_str; +static bool found; + +static int libbpf_debug_print(enum libbpf_print_level level, + const char *format, va_list args) +{ + char *log_buf; + + if (level != LIBBPF_WARN || + strcmp(format, "libbpf: \n%s\n")) { + vprintf(format, args); + return 0; + } + + log_buf = va_arg(args, char *); + if (!log_buf) + goto out; + if (err_str && strstr(log_buf, err_str) != NULL) + found = true; +out: + printf(format, log_buf); + return 0; +} + +static void test_invalid_license(void) +{ + libbpf_print_fn_t old_print_fn; + struct bpf_tcp_nogpl *skel; + + err_str = "struct ops programs must have a GPL compatible license"; + found = false; + old_print_fn = libbpf_set_print(libbpf_debug_print); + + skel = bpf_tcp_nogpl__open_and_load(); + ASSERT_NULL(skel, "bpf_tcp_nogpl"); + ASSERT_EQ(found, true, "expected_err_msg"); + + bpf_tcp_nogpl__destroy(skel); + libbpf_set_print(old_print_fn); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) test_dctcp(); if (test__start_subtest("cubic")) test_cubic(); + if (test__start_subtest("invalid_license")) + test_invalid_license(); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e698ee6bb6c2..3d002c245d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -76,6 +76,7 @@ void test_bpf_verif_scale(void) { "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "loop4.o", BPF_PROG_TYPE_SCHED_CLS }, { "loop5.o", BPF_PROG_TYPE_SCHED_CLS }, + { "loop6.o", BPF_PROG_TYPE_KPROBE }, /* partial unroll. 19k insn in a loop. * Total program size 20.8k insn. 
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 6a7ee7420701..0457ae32b270 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1903,7 +1903,7 @@ static struct btf_raw_test raw_tests[] = { .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x10000000, 4), + BTF_TYPE_ENC(0, 0x20000000, 4), BTF_END_RAW, }, .str_sec = "", @@ -3531,6 +3531,136 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "float test #1, well-formed", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [2] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 8), /* [4] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 12), /* [5] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 16), /* [6] */ + BTF_STRUCT_ENC(NAME_TBD, 5, 48), /* [7] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_MEMBER_ENC(NAME_TBD, 3, 32), + BTF_MEMBER_ENC(NAME_TBD, 4, 64), + BTF_MEMBER_ENC(NAME_TBD, 5, 128), + BTF_MEMBER_ENC(NAME_TBD, 6, 256), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0_Float16\0float\0double\0_Float80\0long_double" + "\0floats\0a\0b\0c\0d\0e"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 48, + .key_type_id = 1, + .value_type_id = 7, + .max_entries = 1, +}, +{ + .descr = "float test #2, invalid vlen", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 1), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "vlen != 0", +}, +{ + .descr = "float test #3, invalid 
kind_flag", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 1, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid btf_info kind_flag", +}, +{ + .descr = "float test #4, member does not fit", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 2), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member exceeds struct_size", +}, +{ + .descr = "float test #5, member is not properly aligned", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 8), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member is not properly aligned", +}, +{ + .descr = "float test #6, invalid size", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 6), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 6, + .key_type_id = 
1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid type_size", +}, + }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -6281,11 +6411,12 @@ const struct btf_dedup_test dedup_tests[] = { /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [3] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [3] */ BTF_MEMBER_ENC(NAME_NTH(3), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(4), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(5), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(6), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ @@ -6296,39 +6427,43 @@ const struct btf_dedup_test dedup_tests[] = { /* full copy of the above */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [7] */ BTF_TYPE_ARRAY_ENC(7, 7, 16), /* [8] */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [9] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [9] */ BTF_MEMBER_ENC(NAME_NTH(3), 10, 0), BTF_MEMBER_ENC(NAME_NTH(4), 11, 64), BTF_MEMBER_ENC(NAME_NTH(5), 8, 128), BTF_MEMBER_ENC(NAME_NTH(6), 7, 640), + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), BTF_PTR_ENC(9), /* [10] */ BTF_PTR_ENC(12), /* [11] */ BTF_CONST_ENC(7), /* [12] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [13] */ BTF_END_RAW, }, - BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"), + BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0float\0d"), }, .expect = { .raw_types = { /* int */ - BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(6), 4, 84), /* [3] */ - BTF_MEMBER_ENC(NAME_NTH(5), 4, 0), /* struct s *next; */ + BTF_STRUCT_ENC(NAME_NTH(8), 5, 88), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(7), 4, 0), /* struct s *next; 
*/ BTF_MEMBER_ENC(NAME_NTH(1), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(2), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(3), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(4), 7, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ BTF_PTR_ENC(6), /* [5] */ /* const -> [1] int */ BTF_CONST_ENC(1), /* [6] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [7] */ BTF_END_RAW, }, - BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"), + BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"), }, .opts = { .dont_resolve_fwds = false, @@ -6449,9 +6584,10 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .expect = { .raw_types = { @@ -6474,16 +6610,17 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .opts = { .dont_resolve_fwds = false, }, }, { - .descr = "dedup: no int duplicates", + .descr = "dedup: no int/float duplicates", .input = { .raw_types = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8), @@ -6498,9 +6635,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), 
BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .expect = { .raw_types = { @@ -6516,9 +6659,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .opts = { .dont_resolve_fwds = false, @@ -6630,6 +6779,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index c60091ee8a21..5e129dc2073c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -77,7 +77,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) snprintf(out_file, sizeof(out_file), "/tmp/%s.output.XXXXXX", t->file); fd = mkstemp(out_file); - if (CHECK(fd < 0, "create_tmp", "failed to create file: %d\n", fd)) { + if (!ASSERT_GE(fd, 0, "create_tmp")) { err = fd; goto done; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_endian.c b/tools/testing/selftests/bpf/prog_tests/btf_endian.c index 8c52d72c876e..8ab5d3e358dd 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_endian.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_endian.c @@ -6,8 +6,6 @@ #include <test_progs.h> #include <bpf/btf.h> -static int duration = 0; - void test_btf_endian() { #if __BYTE_ORDER == __LITTLE_ENDIAN enum btf_endianness endian = BTF_LITTLE_ENDIAN; @@ -71,7 
+69,7 @@ void test_btf_endian() { /* now modify original BTF */ var_id = btf__add_var(btf, "some_var", BTF_VAR_GLOBAL_ALLOCATED, 1); - CHECK(var_id <= 0, "var_id", "failed %d\n", var_id); + ASSERT_GT(var_id, 0, "var_id"); btf__free(swap_btf); swap_btf = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c index 4d9b514b3fd9..736796e56ed1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -54,7 +54,7 @@ void test_cgroup_link(void) for (i = 0; i < cg_nr; i++) { cgs[i].fd = create_and_get_cgroup(cgs[i].path); - if (CHECK(cgs[i].fd < 0, "cg_create", "fail: %d\n", cgs[i].fd)) + if (!ASSERT_GE(cgs[i].fd, 0, "cg_create")) goto cleanup; } diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 36af1c138faf..b62a39315336 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -128,6 +128,8 @@ static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex) test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); @@ -187,6 +189,8 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); } diff --git 
a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 06eb956ff7bb..607710826dca 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -210,11 +210,6 @@ static int duration = 0; .bpf_obj_file = "test_core_reloc_existence.o", \ .btf_src_file = "btf__core_reloc_" #name ".o" \ -#define FIELD_EXISTS_ERR_CASE(name) { \ - FIELD_EXISTS_CASE_COMMON(name), \ - .fails = true, \ -} - #define BITFIELDS_CASE_COMMON(objfile, test_name_prefix, name) \ .case_name = test_name_prefix#name, \ .bpf_obj_file = objfile, \ @@ -222,7 +217,7 @@ static int duration = 0; #define BITFIELDS_CASE(name, ...) { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \ - "direct:", name), \ + "probed:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ @@ -230,7 +225,7 @@ static int duration = 0; .output_len = sizeof(struct core_reloc_bitfields_output), \ }, { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ - "probed:", name), \ + "direct:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ @@ -266,6 +261,7 @@ static int duration = 0; .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \ .ptr_sz = 8, /* always 8-byte pointer for BPF */ \ .enum_sz = sizeof(((type *)0)->enum_field), \ + .float_sz = sizeof(((type *)0)->float_field), \ } #define SIZE_CASE(name) { \ @@ -550,8 +546,7 @@ static struct core_reloc_test_case test_cases[] = { ARRAYS_ERR_CASE(arrays___err_too_small), ARRAYS_ERR_CASE(arrays___err_too_shallow), ARRAYS_ERR_CASE(arrays___err_non_array), - ARRAYS_ERR_CASE(arrays___err_wrong_val_type1), - ARRAYS_ERR_CASE(arrays___err_wrong_val_type2), + 
ARRAYS_ERR_CASE(arrays___err_wrong_val_type), ARRAYS_ERR_CASE(arrays___err_bad_zero_sz_arr), /* enum/ptr/int handling scenarios */ @@ -642,13 +637,25 @@ static struct core_reloc_test_case test_cases[] = { }, .output_len = sizeof(struct core_reloc_existence_output), }, - - FIELD_EXISTS_ERR_CASE(existence__err_int_sz), - FIELD_EXISTS_ERR_CASE(existence__err_int_type), - FIELD_EXISTS_ERR_CASE(existence__err_int_kind), - FIELD_EXISTS_ERR_CASE(existence__err_arr_kind), - FIELD_EXISTS_ERR_CASE(existence__err_arr_value_type), - FIELD_EXISTS_ERR_CASE(existence__err_struct_type), + { + FIELD_EXISTS_CASE_COMMON(existence___wrong_field_defs), + .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___wrong_field_defs) { + }, + .input_len = sizeof(struct core_reloc_existence___wrong_field_defs), + .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { + .a_exists = 0, + .b_exists = 0, + .c_exists = 0, + .arr_exists = 0, + .s_exists = 0, + .a_value = 0xff000001u, + .b_value = 0xff000002u, + .c_value = 0xff000003u, + .arr_value = 0xff000004u, + .s_value = 0xff000005u, + }, + .output_len = sizeof(struct core_reloc_existence_output), + }, /* bitfield relocation checks */ BITFIELDS_CASE(bitfields, { @@ -857,13 +864,20 @@ void test_core_reloc(void) "prog '%s' not found\n", probe_name)) goto cleanup; + + if (test_case->btf_src_file) { + err = access(test_case->btf_src_file, R_OK); + if (!ASSERT_OK(err, "btf_src_file")) + goto cleanup; + } + load_attr.obj = obj; load_attr.log_level = 0; load_attr.target_btf_path = test_case->btf_src_file; err = bpf_object__load_xattr(&load_attr); if (err) { if (!test_case->fails) - CHECK(false, "obj_load", "failed to load prog '%s': %d\n", probe_name, err); + ASSERT_OK(err, "obj_load"); goto cleanup; } @@ -902,10 +916,8 @@ void test_core_reloc(void) goto cleanup; } - if (test_case->fails) { - CHECK(false, "obj_load_fail", "should fail to load prog '%s'\n", probe_name); + if (!ASSERT_FALSE(test_case->fails, "obj_load_should_fail")) goto cleanup; - } 
equal = memcmp(data->out, test_case->output, test_case->output_len) == 0; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 04ebbf1cb390..7cb111b11995 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -3,35 +3,57 @@ #include <test_progs.h> #include "fentry_test.skel.h" -void test_fentry_test(void) +static int fentry_test(struct fentry_test *fentry_skel) { - struct fentry_test *fentry_skel = NULL; int err, prog_fd, i; __u32 duration = 0, retval; + struct bpf_link *link; __u64 *result; - fentry_skel = fentry_test__open_and_load(); - if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) - goto cleanup; - err = fentry_test__attach(fentry_skel); - if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "fentry_attach")) + return err; + + /* Check that already linked program can't be attached again. 
*/ + link = bpf_program__attach(fentry_skel->progs.test1); + if (!ASSERT_ERR_PTR(link, "fentry_attach_link")) + return -1; prog_fd = bpf_program__fd(fentry_skel->progs.test1); err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "test_run", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); result = (__u64 *)fentry_skel->bss; - for (i = 0; i < 6; i++) { - if (CHECK(result[i] != 1, "result", - "fentry_test%d failed err %lld\n", i + 1, result[i])) - goto cleanup; + for (i = 0; i < sizeof(*fentry_skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(result[i], 1, "fentry_result")) + return -1; } + fentry_test__detach(fentry_skel); + + /* zero results for re-attach test */ + memset(fentry_skel->bss, 0, sizeof(*fentry_skel->bss)); + return 0; +} + +void test_fentry_test(void) +{ + struct fentry_test *fentry_skel = NULL; + int err; + + fentry_skel = fentry_test__open_and_load(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) + goto cleanup; + + err = fentry_test(fentry_skel); + if (!ASSERT_OK(err, "fentry_first_attach")) + goto cleanup; + + err = fentry_test(fentry_skel); + ASSERT_OK(err, "fentry_second_attach"); + cleanup: fentry_test__destroy(fentry_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index 5c0448910426..63990842d20f 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -58,42 +58,73 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, test_cb cb) { struct bpf_object *obj = NULL, *tgt_obj; + __u32 retval, tgt_prog_id, info_len; + struct bpf_prog_info prog_info = {}; struct bpf_program **prog = NULL; struct bpf_link **link = NULL; - __u32 duration = 0, retval; int err, tgt_fd, i; + struct btf *btf; err = bpf_prog_load(target_obj_file, 
BPF_PROG_TYPE_UNSPEC, &tgt_obj, &tgt_fd); - if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n", - target_obj_file, err, errno)) + if (!ASSERT_OK(err, "tgt_prog_load")) return; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .attach_prog_fd = tgt_fd, ); + info_len = sizeof(prog_info); + err = bpf_obj_get_info_by_fd(tgt_fd, &prog_info, &info_len); + if (!ASSERT_OK(err, "tgt_fd_get_info")) + goto close_prog; + + tgt_prog_id = prog_info.id; + btf = bpf_object__btf(tgt_obj); + link = calloc(sizeof(struct bpf_link *), prog_cnt); + if (!ASSERT_OK_PTR(link, "link_ptr")) + goto close_prog; + prog = calloc(sizeof(struct bpf_program *), prog_cnt); - if (CHECK(!link || !prog, "alloc_memory", "failed to alloc memory")) + if (!ASSERT_OK_PTR(prog, "prog_ptr")) goto close_prog; obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open %s: %ld\n", obj_file, - PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) goto close_prog; err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) + if (!ASSERT_OK(err, "obj_load")) goto close_prog; for (i = 0; i < prog_cnt; i++) { + struct bpf_link_info link_info; + char *tgt_name; + __s32 btf_id; + + tgt_name = strstr(prog_name[i], "/"); + if (!ASSERT_OK_PTR(tgt_name, "tgt_name")) + goto close_prog; + btf_id = btf__find_by_name_kind(btf, tgt_name + 1, BTF_KIND_FUNC); + prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name[i])) + if (!ASSERT_OK_PTR(prog[i], prog_name[i])) goto close_prog; + link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) + if (!ASSERT_OK_PTR(link[i], "attach_trace")) goto close_prog; + + info_len = sizeof(link_info); + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link[i]), + &link_info, &info_len); + ASSERT_OK(err, "link_fd_get_info"); + 
ASSERT_EQ(link_info.tracing.attach_type, + bpf_program__get_expected_attach_type(prog[i]), + "link_attach_type"); + ASSERT_EQ(link_info.tracing.target_obj_id, tgt_prog_id, "link_tgt_obj_id"); + ASSERT_EQ(link_info.tracing.target_btf_id, btf_id, "link_tgt_btf_id"); } if (cb) { @@ -106,10 +137,9 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, goto close_prog; err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + NULL, NULL, &retval, NULL); + ASSERT_OK(err, "prog_run"); + ASSERT_EQ(retval, 0, "prog_run_ret"); if (check_data_map(obj, prog_cnt, false)) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c new file mode 100644 index 000000000000..ccc7e8a34ab6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include <time.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include "fexit_sleep.skel.h" + +static int do_sleep(void *skel) +{ + struct fexit_sleep *fexit_skel = skel; + struct timespec ts1 = { .tv_nsec = 1 }; + struct timespec ts2 = { .tv_sec = 10 }; + + fexit_skel->bss->pid = getpid(); + (void)syscall(__NR_nanosleep, &ts1, NULL); + (void)syscall(__NR_nanosleep, &ts2, NULL); + return 0; +} + +#define STACK_SIZE (1024 * 1024) +static char child_stack[STACK_SIZE]; + +void test_fexit_sleep(void) +{ + struct fexit_sleep *fexit_skel = NULL; + int wstatus, duration = 0; + pid_t cpid; + int err, fexit_cnt; + + fexit_skel = fexit_sleep__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; + + err = fexit_sleep__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", 
err)) + goto cleanup; + + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) + goto cleanup; + + /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ + while (READ_ONCE(fexit_skel->bss->fentry_cnt) != 2); + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + + /* close progs and detach them. That will trigger two nop5->jmp5 rewrites + * in the trampolines to skip nanosleep_fexit prog. + * The nanosleep_fentry prog will get detached first. + * The nanosleep_fexit prog will get detached second. + * Detaching will trigger freeing of both progs JITed images. + * There will be two dying bpf_tramp_image-s, but only the initial + * bpf_tramp_image (with both _fentry and _fexit progs will be stuck + * waiting for percpu_ref_kill to confirm). The other one + * will be freed quickly. + */ + close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry)); + close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit)); + fexit_sleep__detach(fexit_skel); + + /* kill the thread to unwind sys_nanosleep stack through the trampoline */ + kill(cpid, 9); + + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) + goto cleanup; + if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) + goto cleanup; + + /* The bypassed nanosleep_fexit prog shouldn't have executed. + * Unlike progs the maps were not freed and directly accessible. 
+ */ + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + +cleanup: + fexit_sleep__destroy(fexit_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 78d7a2765c27..6792e41f7f69 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -3,35 +3,57 @@ #include <test_progs.h> #include "fexit_test.skel.h" -void test_fexit_test(void) +static int fexit_test(struct fexit_test *fexit_skel) { - struct fexit_test *fexit_skel = NULL; int err, prog_fd, i; __u32 duration = 0, retval; + struct bpf_link *link; __u64 *result; - fexit_skel = fexit_test__open_and_load(); - if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) - goto cleanup; - err = fexit_test__attach(fexit_skel); - if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "fexit_attach")) + return err; + + /* Check that already linked program can't be attached again. 
*/ + link = bpf_program__attach(fexit_skel->progs.test1); + if (!ASSERT_ERR_PTR(link, "fexit_attach_link")) + return -1; prog_fd = bpf_program__fd(fexit_skel->progs.test1); err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "test_run", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); result = (__u64 *)fexit_skel->bss; - for (i = 0; i < 6; i++) { - if (CHECK(result[i] != 1, "result", - "fexit_test%d failed err %lld\n", i + 1, result[i])) - goto cleanup; + for (i = 0; i < sizeof(*fexit_skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(result[i], 1, "fexit_result")) + return -1; } + fexit_test__detach(fexit_skel); + + /* zero results for re-attach test */ + memset(fexit_skel->bss, 0, sizeof(*fexit_skel->bss)); + return 0; +} + +void test_fexit_test(void) +{ + struct fexit_test *fexit_skel = NULL; + int err; + + fexit_skel = fexit_test__open_and_load(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) + goto cleanup; + + err = fexit_test(fexit_skel); + if (!ASSERT_OK(err, "fexit_first_attach")) + goto cleanup; + + err = fexit_test(fexit_skel); + ASSERT_OK(err, "fexit_second_attach"); + cleanup: fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c new file mode 100644 index 000000000000..68eb12a287d4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "for_each_hash_map_elem.skel.h" +#include "for_each_array_map_elem.skel.h" + +static unsigned int duration; + +static void test_hash_map(void) +{ + int i, err, hashmap_fd, max_entries, percpu_map_fd; + struct for_each_hash_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u32 key, num_cpus, retval; + __u64 
val; + + skel = for_each_hash_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load")) + return; + + hashmap_fd = bpf_map__fd(skel->maps.hashmap); + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + err = bpf_map_update_elem(hashmap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 1; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output"); + ASSERT_EQ(skel->bss->hashmap_elems, max_entries, "hashmap_elems"); + + key = 1; + err = bpf_map_lookup_elem(hashmap_fd, &key, &val); + ASSERT_ERR(err, "hashmap_lookup"); + + ASSERT_EQ(skel->bss->percpu_called, 1, "percpu_called"); + ASSERT_LT(skel->bss->cpu, num_cpus, "num_cpus"); + ASSERT_EQ(skel->bss->percpu_map_elems, 1, "percpu_map_elems"); + ASSERT_EQ(skel->bss->percpu_key, 1, "percpu_key"); + ASSERT_EQ(skel->bss->percpu_val, skel->bss->cpu + 1, "percpu_val"); + ASSERT_EQ(skel->bss->percpu_output, 100, "percpu_output"); +out: + free(percpu_valbuf); + for_each_hash_map_elem__destroy(skel); +} + +static void test_array_map(void) +{ + __u32 key, num_cpus, max_entries, retval; + int i, arraymap_fd, percpu_map_fd, err; + struct for_each_array_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u64 val, expected_total; + + skel = 
for_each_array_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load")) + return; + + arraymap_fd = bpf_map__fd(skel->maps.arraymap); + expected_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + /* skip the last iteration for expected total */ + if (i != max_entries - 1) + expected_total += val; + err = bpf_map_update_elem(arraymap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 0; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output"); + ASSERT_EQ(skel->bss->cpu + 1, skel->bss->percpu_val, "percpu_val"); + +out: + free(percpu_valbuf); + for_each_array_map_elem__destroy(skel); +} + +void test_for_each(void) +{ + if (test__start_subtest("hash_map")) + test_hash_map(); + if (test__start_subtest("array_map")) + test_array_map(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 42c3a3103c26..d65107919998 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -134,7 +134,7 @@ void test_kfree_skb(void) /* make sure kfree_skb program was triggered * and it sent expected skb into ring buffer */ - CHECK_FAIL(!passed); 
+ ASSERT_TRUE(passed, "passed"); err = bpf_map_lookup_elem(bpf_map__fd(global_data), &zero, test_ok); if (CHECK(err, "get_result", diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c new file mode 100644 index 000000000000..7fc0951ee75f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test_subprog.skel.h" + +static void test_main(void) +{ + struct kfunc_call_test *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 12, "test1-retval"); + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test2)"); + ASSERT_EQ(retval, 3, "test2-retval"); + + kfunc_call_test__destroy(skel); +} + +static void test_subprog(void) +{ + struct kfunc_call_test_subprog *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + + kfunc_call_test_subprog__destroy(skel); +} + +void test_kfunc_call(void) +{ + if 
(test__start_subtest("main")) + test_main(); + + if (test__start_subtest("subprog")) + test_subprog(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c new file mode 100644 index 000000000000..e9916f2817ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_funcs.skel.h" + +void test_linked_funcs(void) +{ + int err; + struct linked_funcs *skel; + + skel = linked_funcs__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->my_tid = syscall(SYS_gettid); + skel->bss->syscall_id = SYS_getpgid; + + err = linked_funcs__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_funcs__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_val1, 2000 + 2000, "output_val1"); + ASSERT_EQ(skel->bss->output_ctx1, SYS_getpgid, "output_ctx1"); + ASSERT_EQ(skel->bss->output_weak1, 42, "output_weak1"); + + ASSERT_EQ(skel->bss->output_val2, 2 * 1000 + 2 * (2 * 1000), "output_val2"); + ASSERT_EQ(skel->bss->output_ctx2, SYS_getpgid, "output_ctx2"); + /* output_weak2 should never be updated */ + ASSERT_EQ(skel->bss->output_weak2, 0, "output_weak2"); + +cleanup: + linked_funcs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_maps.c b/tools/testing/selftests/bpf/prog_tests/linked_maps.c new file mode 100644 index 000000000000..85dcaaaf2775 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_maps.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_maps.skel.h" + +void test_linked_maps(void) +{ + int err; + struct linked_maps *skel; + + skel = 
linked_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = linked_maps__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_first1, 2000, "output_first1"); + ASSERT_EQ(skel->bss->output_second1, 2, "output_second1"); + ASSERT_EQ(skel->bss->output_weak1, 2, "output_weak1"); + +cleanup: + linked_maps__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_vars.c b/tools/testing/selftests/bpf/prog_tests/linked_vars.c new file mode 100644 index 000000000000..267166abe4c1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_vars.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "linked_vars.skel.h" + +void test_linked_vars(void) +{ + int err; + struct linked_vars *skel; + + skel = linked_vars__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->input_bss1 = 1000; + skel->bss->input_bss2 = 2000; + skel->bss->input_bss_weak = 3000; + + err = linked_vars__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_vars__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_bss1, 1000 + 2000 + 3000, "output_bss1"); + ASSERT_EQ(skel->bss->output_bss2, 1000 + 2000 + 3000, "output_bss2"); + /* 10 comes from "winner" input_data_weak in first obj file */ + ASSERT_EQ(skel->bss->output_data1, 1 + 2 + 10, "output_bss1"); + ASSERT_EQ(skel->bss->output_data2, 1 + 2 + 10, "output_bss2"); + /* 100 comes from "winner" input_rodata_weak in first obj file */ + ASSERT_EQ(skel->bss->output_rodata1, 11 + 22 + 100, "output_weak1"); + ASSERT_EQ(skel->bss->output_rodata2, 11 + 22 + 100, "output_weak2"); + +cleanup: + linked_vars__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c 
b/tools/testing/selftests/bpf/prog_tests/map_ptr.c index c230a573c373..4972f92205c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c @@ -12,11 +12,22 @@ void test_map_ptr(void) __u32 duration = 0, retval; char buf[128]; int err; + int page_size = getpagesize(); - skel = map_ptr_kern__open_and_load(); - if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + skel = map_ptr_kern__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) return; + err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = map_ptr_kern__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->page_size = page_size; + err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4, sizeof(pkt_v4), buf, NULL, &retval, NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 9c3c5c0f068f..37b002ca1167 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -29,22 +29,36 @@ void test_mmap(void) struct test_mmap *skel; __u64 val = 0; - skel = test_mmap__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + skel = test_mmap__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.rdonly_map, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + /* at least 4 pages of data */ + err = bpf_map__set_max_entries(skel->maps.data_map, + 4 * (page_size / sizeof(u64))); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_mmap__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + bss_map = skel->maps.bss; data_map = skel->maps.data_map; 
data_map_fd = bpf_map__fd(data_map); rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); - tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { - munmap(tmp1, 4096); + munmap(tmp1, page_size); goto cleanup; } /* now double-check if it's mmap()'able at all */ - tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 5bc53d53d86e..d85a69b7ce44 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -45,12 +45,18 @@ static int trigger_module_test_write(int write_sz) return 0; } +static int delete_module(const char *name, int flags) +{ + return syscall(__NR_delete_module, name, flags); +} + void test_module_attach(void) { const int READ_SZ = 456; const int WRITE_SZ = 457; struct test_module_attach* skel; struct test_module_attach__bss *bss; + struct bpf_link *link; int err; skel = test_module_attach__open(); @@ -84,6 +90,23 @@ void test_module_attach(void) ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); + test_module_attach__detach(skel); + + /* attach fentry/fexit and make sure it get's module reference */ + link = bpf_program__attach(skel->progs.handle_fentry); + if (!ASSERT_OK_PTR(link, "attach_fentry")) + goto cleanup; + + ASSERT_ERR(delete_module("bpf_testmod", 0), "delete_module"); + bpf_link__destroy(link); + + link = bpf_program__attach(skel->progs.handle_fexit); + if (!ASSERT_OK_PTR(link, "attach_fexit")) + goto cleanup; + + ASSERT_ERR(delete_module("bpf_testmod", 0), 
"delete_module"); + bpf_link__destroy(link); + cleanup: test_module_attach__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 31a3114906e2..2535788e135f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -68,10 +68,10 @@ static void test_ns_current_pid_tgid_new_ns(void) cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, CLONE_NEWPID | SIGCHLD, NULL); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) return; if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 935a294f049a..131d7f7eeb42 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -2,12 +2,31 @@ #include <test_progs.h> #include <network_helpers.h> -void test_prog_run_xattr(void) +#include "test_pkt_access.skel.h" + +static const __u32 duration; + +static void check_run_cnt(int prog_fd, __u64 run_cnt) { - const char *file = "./test_pkt_access.o"; - struct bpf_object *obj; - char buf[10]; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd)) + return; + + CHECK(run_cnt != info.run_cnt, "run_cnt", + "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt); +} + +void test_prog_run_xattr(void) +{ + struct test_pkt_access *skel; + int err, stats_fd = -1; + char buf[10] 
= {}; + __u64 run_cnt = 0; + struct bpf_prog_test_run_attr tattr = { .repeat = 1, .data_in = &pkt_v4, @@ -16,12 +35,15 @@ void test_prog_run_xattr(void) .data_size_out = 5, }; - err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, - &tattr.prog_fd); - if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno)) return; - memset(buf, 0, sizeof(buf)); + skel = test_pkt_access__open_and_load(); + if (CHECK_ATTR(!skel, "open_and_load", "failed\n")) + goto cleanup; + + tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access); err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run", @@ -34,8 +56,12 @@ void test_prog_run_xattr(void) CHECK_ATTR(buf[5] != 0, "overflow", "BPF_PROG_TEST_RUN ignored size hint\n"); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + tattr.data_out = NULL; tattr.data_size_out = 0; + tattr.repeat = 2; errno = 0; err = bpf_prog_test_run_xattr(&tattr); @@ -46,5 +72,12 @@ void test_prog_run_xattr(void) err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err); - bpf_object__close(obj); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + +cleanup: + if (skel) + test_pkt_access__destroy(skel); + if (stats_fd != -1) + close(stats_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 6ace5e9efec1..d3c2de2c24d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -160,11 +160,8 @@ int test_resolve_btfids(void) break; if (i > 0) { - ret = CHECK(test_set.ids[i - 1] > test_set.ids[i], - "sort_check", - "test_set is not sorted\n"); - if (ret) - break; + if (!ASSERT_LE(test_set.ids[i - 1], test_set.ids[i], "sort_check")) + return -1; } } 
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index fddbc5db5d6a..de78617f6550 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -87,11 +87,20 @@ void test_ringbuf(void) pthread_t thread; long bg_ret = -1; int err, cnt; + int page_size = getpagesize(); - skel = test_ringbuf__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_ringbuf__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -110,9 +119,9 @@ void test_ringbuf(void) CHECK(skel->bss->avail_data != 3 * rec_sz, "err_avail_size", "exp %ld, got %ld\n", 3L * rec_sz, skel->bss->avail_data); - CHECK(skel->bss->ring_size != 4096, + CHECK(skel->bss->ring_size != page_size, "err_ring_size", "exp %ld, got %ld\n", - 4096L, skel->bss->ring_size); + (long)page_size, skel->bss->ring_size); CHECK(skel->bss->cons_pos != 0, "err_cons_pos", "exp %ld, got %ld\n", 0L, skel->bss->cons_pos); diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index d37161e59bb2..cef63e703924 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -41,13 +41,42 @@ static int process_sample(void *ctx, void *data, size_t len) void test_ringbuf_multi(void) { struct test_ringbuf_multi *skel; - struct ring_buffer *ringbuf; + struct ring_buffer *ringbuf = NULL; int err; + int page_size = getpagesize(); + int proto_fd = -1; - skel = 
test_ringbuf_multi__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf_multi__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf1, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.ringbuf2, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(bpf_map__inner_map(skel->maps.ringbuf_arr), page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); + if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n")) + goto cleanup; + + err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); + if (CHECK(err != 0, "bpf_map__set_inner_map_fd", "bpf_map__set_inner_map_fd failed\n")) + goto cleanup; + + err = test_ringbuf_multi__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + + close(proto_fd); + proto_fd = -1; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -97,6 +126,8 @@ void test_ringbuf_multi(void) 2L, skel->bss->total); cleanup: + if (proto_fd >= 0) + close(proto_fd); ring_buffer__free(ringbuf); test_ringbuf_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9ff0412e1fd3..45c82db3c58c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -241,6 +241,48 @@ fail: return -1; } +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + 
"getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port, + const char *remote_ip, __u16 remote_port) +{ + void *local, *remote; + int err; + + memset(ctx, 0, sizeof(*ctx)); + ctx->local_port = local_port; + ctx->remote_port = htons(remote_port); + + if (is_ipv6(local_ip)) { + ctx->family = AF_INET6; + local = &ctx->local_ip6[0]; + remote = &ctx->remote_ip6[0]; + } else { + ctx->family = AF_INET; + local = &ctx->local_ip4; + remote = &ctx->remote_ip4; + } + + err = inet_pton(ctx->family, local_ip, local); + if (CHECK(err != 1, "inet_pton", "local_ip failed\n")) + return 1; + + err = inet_pton(ctx->family, remote_ip, remote); + if (CHECK(err != 1, "inet_pton", "remote_ip failed\n")) + return 1; + + return 0; +} + static int send_byte(int fd) { ssize_t n; @@ -1009,18 +1051,27 @@ static void test_drop_on_reuseport(struct test_sk_lookup *skel) static void run_sk_assign(struct test_sk_lookup *skel, struct bpf_program *lookup_prog, - const char *listen_ip, const char *connect_ip) + const char *remote_ip, const char *local_ip) { - int client_fd, peer_fd, server_fds[MAX_SERVERS] = { -1 }; - struct bpf_link *lookup_link; + int server_fds[MAX_SERVERS] = { -1 }; + struct bpf_sk_lookup ctx; + __u64 server_cookie; int i, err; - lookup_link = attach_lookup_prog(lookup_prog); - if (!lookup_link) + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = &ctx, + .ctx_size_in = sizeof(ctx), + .ctx_out = &ctx, + .ctx_size_out = sizeof(ctx), + ); + + if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT)) return; + ctx.protocol = IPPROTO_TCP; + for (i = 0; i < ARRAY_SIZE(server_fds); i++) { - server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL); + server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL); if (server_fds[i] < 0) goto close_servers; @@ -1030,23 +1081,25 @@ static void run_sk_assign(struct test_sk_lookup *skel, goto 
close_servers; } - client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT); - if (client_fd < 0) + server_cookie = socket_cookie(server_fds[SERVER_B]); + if (!server_cookie) + return; + + err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts); + if (CHECK(err, "test_run", "failed with error %d\n", errno)) + goto close_servers; + + if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n")) goto close_servers; - peer_fd = accept(server_fds[SERVER_B], NULL, NULL); - if (CHECK(peer_fd < 0, "accept", "failed\n")) - goto close_client; + CHECK(ctx.cookie != server_cookie, "ctx.cookie", + "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie); - close(peer_fd); -close_client: - close(client_fd); close_servers: for (i = 0; i < ARRAY_SIZE(server_fds); i++) { if (server_fds[i] != -1) close(server_fds[i]); } - bpf_link__destroy(lookup_link); } static void run_sk_assign_v4(struct test_sk_lookup *skel, diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c new file mode 100644 index 000000000000..a958c22aec75 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. 
*/ + +#include <test_progs.h> +#include "test_snprintf.skel.h" +#include "test_snprintf_single.skel.h" + +#define EXP_NUM_OUT "-8 9 96 -424242 1337 DABBAD00" +#define EXP_NUM_RET sizeof(EXP_NUM_OUT) + +#define EXP_IP_OUT "127.000.000.001 0000:0000:0000:0000:0000:0000:0000:0001" +#define EXP_IP_RET sizeof(EXP_IP_OUT) + +/* The third specifier, %pB, depends on compiler inlining so don't check it */ +#define EXP_SYM_OUT "schedule schedule+0x0/" +#define MIN_SYM_RET sizeof(EXP_SYM_OUT) + +/* The third specifier, %p, is a hashed pointer which changes on every reboot */ +#define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " +#define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") + +#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_RET sizeof(EXP_STR_OUT) + +#define EXP_OVER_OUT "%over" +#define EXP_OVER_RET 10 + +#define EXP_PAD_OUT " 4 000" +#define EXP_PAD_RET 900007 + +#define EXP_NO_ARG_OUT "simple case" +#define EXP_NO_ARG_RET 12 + +#define EXP_NO_BUF_RET 29 + +void test_snprintf_positive(void) +{ + char exp_addr_out[] = EXP_ADDR_OUT; + char exp_sym_out[] = EXP_SYM_OUT; + struct test_snprintf *skel; + + skel = test_snprintf__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + ASSERT_STREQ(skel->bss->num_out, EXP_NUM_OUT, "num_out"); + ASSERT_EQ(skel->bss->num_ret, EXP_NUM_RET, "num_ret"); + + ASSERT_STREQ(skel->bss->ip_out, EXP_IP_OUT, "ip_out"); + ASSERT_EQ(skel->bss->ip_ret, EXP_IP_RET, "ip_ret"); + + ASSERT_OK(memcmp(skel->bss->sym_out, exp_sym_out, + sizeof(exp_sym_out) - 1), "sym_out"); + ASSERT_LT(MIN_SYM_RET, skel->bss->sym_ret, "sym_ret"); + + ASSERT_OK(memcmp(skel->bss->addr_out, exp_addr_out, + sizeof(exp_addr_out) - 1), "addr_out"); + ASSERT_EQ(skel->bss->addr_ret, EXP_ADDR_RET, "addr_ret"); + + ASSERT_STREQ(skel->bss->str_out, EXP_STR_OUT, "str_out"); + ASSERT_EQ(skel->bss->str_ret, EXP_STR_RET, 
"str_ret"); + + ASSERT_STREQ(skel->bss->over_out, EXP_OVER_OUT, "over_out"); + ASSERT_EQ(skel->bss->over_ret, EXP_OVER_RET, "over_ret"); + + ASSERT_STREQ(skel->bss->pad_out, EXP_PAD_OUT, "pad_out"); + ASSERT_EQ(skel->bss->pad_ret, EXP_PAD_RET, "pad_ret"); + + ASSERT_STREQ(skel->bss->noarg_out, EXP_NO_ARG_OUT, "no_arg_out"); + ASSERT_EQ(skel->bss->noarg_ret, EXP_NO_ARG_RET, "no_arg_ret"); + + ASSERT_EQ(skel->bss->nobuf_ret, EXP_NO_BUF_RET, "no_buf_ret"); + +cleanup: + test_snprintf__destroy(skel); +} + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +/* Loads an eBPF object calling bpf_snprintf with up to 10 characters of fmt */ +static int load_single_snprintf(char *fmt) +{ + struct test_snprintf_single *skel; + int ret; + + skel = test_snprintf_single__open(); + if (!skel) + return -EINVAL; + + memcpy(skel->rodata->fmt, fmt, min(strlen(fmt) + 1, 10)); + + ret = test_snprintf_single__load(skel); + test_snprintf_single__destroy(skel); + + return ret; +} + +void test_snprintf_negative(void) +{ + ASSERT_OK(load_single_snprintf("valid %d"), "valid usage"); + + ASSERT_ERR(load_single_snprintf("0123456789"), "no terminating zero"); + ASSERT_ERR(load_single_snprintf("%d %d"), "too many specifiers"); + ASSERT_ERR(load_single_snprintf("%pi5"), "invalid specifier 1"); + ASSERT_ERR(load_single_snprintf("%a"), "invalid specifier 2"); + ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); + ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); + ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); + ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); +} + +void test_snprintf(void) +{ + if (test__start_subtest("snprintf_positive")) + test_snprintf_positive(); + if (test__start_subtest("snprintf_negative")) + test_snprintf_negative(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c 
index 686b40f11a45..76e1f5fe18fa 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c @@ -42,9 +42,7 @@ void test_snprintf_btf(void) * and it set expected return values from bpf_trace_printk()s * and all tests ran. */ - if (CHECK(bss->ret <= 0, - "bpf_snprintf_btf: got return value", - "ret <= 0 %ld test %d\n", bss->ret, bss->ran_subtests)) + if (!ASSERT_GT(bss->ret, 0, "bpf_snprintf_ret")) goto cleanup; if (CHECK(bss->ran_subtests == 0, "check if subtests ran", diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index b8b48cac2ac3..ab77596b64e3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -7,6 +7,7 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_update.skel.h" #include "test_sockmap_invalid_update.skel.h" +#include "test_sockmap_skb_verdict_attach.skel.h" #include "bpf_iter_sockmap.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -281,6 +282,39 @@ out: bpf_iter_sockmap__destroy(skel); } +static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, + enum bpf_attach_type second) +{ + struct test_sockmap_skb_verdict_attach *skel; + int err, map, verdict; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_sockmap_skb_verdict_attach__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, first, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_attach(verdict, map, second, 0); + assert(err == -1 && errno == EBUSY); + + err = bpf_prog_detach2(verdict, map, first); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + 
test_sockmap_skb_verdict_attach__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -301,4 +335,10 @@ void test_sockmap_basic(void) test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash copy")) test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap skb_verdict attach")) { + test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT, + BPF_SK_SKB_STREAM_VERDICT); + test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_VERDICT); + } } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..648d9ae898d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1603,6 +1603,141 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } +static void udp_redir_to_connected(int family, int 
sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int c0, c1, p0, p1; + unsigned int pass; + socklen_t len; + int err, n; + u64 value; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p0 < 0) + return; + len = sizeof(addr); + err = xgetsockname(p0, sockaddr(&addr), &len); + if (err) + goto close_peer0; + + c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c0 < 0) + goto close_peer0; + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + err = xgetsockname(c0, sockaddr(&addr), &len); + if (err) + goto close_cli0; + err = xconnect(p0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p1 < 0) + goto close_cli0; + err = xgetsockname(p1, sockaddr(&addr), &len); + if (err) + goto close_cli0; + + c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c1 < 0) + goto close_peer1; + err = xconnect(c1, sockaddr(&addr), len); + if (err) + goto close_cli1; + err = xgetsockname(c1, sockaddr(&addr), &len); + if (err) + goto close_cli1; + err = xconnect(p1, sockaddr(&addr), len); + if (err) + goto close_cli1; + + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + key = 1; + value = p1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_cli1; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close_cli1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(mode == REDIR_INGRESS ? 
p0 : c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_cli1: + xclose(c1); +close_peer1: + xclose(p1); +close_cli0: + xclose(c0); +close_peer0: + xclose(p0); +} + +static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + udp_skb_redir_to_connected(skel, map, family); +} + static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, int family) { @@ -1611,6 +1746,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, test_redir(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_DGRAM); + test_udp_redir(skel, map, family); } void test_sockmap_listen(void) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index d5b44b135c00..4b937e5dbaca 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ 
b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -3,6 +3,7 @@ #include "cgroup_helpers.h" #include <linux/tcp.h> +#include "sockopt_sk.skel.h" #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -191,60 +192,30 @@ err: return -1; } -static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title) +static void run_test(int cgroup_fd) { - enum bpf_attach_type attach_type; - enum bpf_prog_type prog_type; - struct bpf_program *prog; - int err; + struct sockopt_sk *skel; - err = libbpf_prog_type_by_name(title, &prog_type, &attach_type); - if (err) { - log_err("Failed to deduct types for %s BPF program", title); - return -1; - } + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; - prog = bpf_object__find_program_by_title(obj, title); - if (!prog) { - log_err("Failed to find %s BPF program", title); - return -1; - } - - err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, - attach_type, 0); - if (err) { - log_err("Failed to attach %s BPF program", title); - return -1; - } - - return 0; -} - -static void run_test(int cgroup_fd) -{ - struct bpf_prog_load_attr attr = { - .file = "./sockopt_sk.o", - }; - struct bpf_object *obj; - int ignored; - int err; - - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) - return; + skel->bss->page_size = getpagesize(); - err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) + goto cleanup; - err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) + goto cleanup; - CHECK_FAIL(getsetsockopt()); + ASSERT_OK(getsetsockopt(), "getsetsockopt"); 
-close_bpf_object: - bpf_object__close(obj); +cleanup: + sockopt_sk__destroy(skel); } void test_sockopt_sk(void) diff --git a/tools/testing/selftests/bpf/prog_tests/static_linked.c b/tools/testing/selftests/bpf/prog_tests/static_linked.c new file mode 100644 index 000000000000..46556976dccc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/static_linked.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <test_progs.h> +#include "test_static_linked.skel.h" + +void test_static_linked(void) +{ + int err; + struct test_static_linked* skel; + + skel = test_static_linked__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->rovar1 = 1; + skel->bss->static_var1 = 2; + skel->bss->static_var11 = 3; + + skel->rodata->rovar2 = 4; + skel->bss->static_var2 = 5; + skel->bss->static_var22 = 6; + + err = test_static_linked__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = test_static_linked__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + usleep(1); + + ASSERT_EQ(skel->bss->var1, 1 * 2 + 2 + 3, "var1"); + ASSERT_EQ(skel->bss->var2, 4 * 3 + 5 + 6, "var2"); + +cleanup: + test_static_linked__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c new file mode 100644 index 000000000000..035c263aab1b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include <unistd.h> +#include <sys/syscall.h> /* For SYS_xxx definitions */ +#include <sys/types.h> +#include <test_progs.h> +#include "task_local_storage.skel.h" +#include "task_local_storage_exit_creds.skel.h" +#include "task_ls_recursion.skel.h" + +static void test_sys_enter_exit(void) +{ + struct task_local_storage 
*skel; + int err; + + skel = task_local_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + skel->bss->target_pid = syscall(SYS_gettid); + + err = task_local_storage__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + syscall(SYS_gettid); + syscall(SYS_gettid); + + /* 3x syscalls: 1x attach and 2x gettid */ + ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt"); + ASSERT_EQ(skel->bss->exit_cnt, 3, "exit_cnt"); + ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); +out: + task_local_storage__destroy(skel); +} + +static void test_exit_creds(void) +{ + struct task_local_storage_exit_creds *skel; + int err; + + skel = task_local_storage_exit_creds__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_local_storage_exit_creds__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger at least one exit_creds() */ + if (CHECK_FAIL(system("ls > /dev/null"))) + goto out; + + /* sync rcu to make sure exit_creds() is called for "ls" */ + kern_sync_rcu(); + ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count"); + ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count"); +out: + task_local_storage_exit_creds__destroy(skel); +} + +static void test_recursion(void) +{ + struct task_ls_recursion *skel; + int err; + + skel = task_ls_recursion__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_ls_recursion__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger sys_enter, make sure it does not cause deadlock */ + syscall(SYS_gettid); + +out: + task_ls_recursion__destroy(skel); +} + +void test_task_local_storage(void) +{ + if (test__start_subtest("sys_enter_exit")) + test_sys_enter_exit(); + if (test__start_subtest("exit_creds")) + test_exit_creds(); + if (test__start_subtest("recursion")) + test_recursion(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c 
b/tools/testing/selftests/bpf/prog_tests/test_ima.c index b54bc0c351b7..0252f61d611a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_ima.c +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -68,7 +68,8 @@ void test_test_ima(void) goto close_prog; snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); - if (CHECK_FAIL(system(cmd))) + err = system(cmd); + if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno)) goto close_clean; err = run_measured_process(measured_dir, &skel->bss->monitored_pid); @@ -81,7 +82,8 @@ void test_test_ima(void) close_clean: snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); - CHECK_FAIL(system(cmd)); + err = system(cmd); + CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno); close_prog: ima__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 2755e4f81499..244c01125126 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -51,43 +51,64 @@ int exec_cmd(int *monitored_pid) return -EINVAL; } -void test_test_lsm(void) +static int test_lsm(struct lsm *skel) { - struct lsm *skel = NULL; - int err, duration = 0; + struct bpf_link *link; int buf = 1234; - - skel = lsm__open_and_load(); - if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) - goto close_prog; + int err; err = lsm__attach(skel); - if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) - goto close_prog; + if (!ASSERT_OK(err, "attach")) + return err; + + /* Check that already linked program can't be attached again. 
*/ + link = bpf_program__attach(skel->progs.test_int_hook); + if (!ASSERT_ERR_PTR(link, "attach_link")) + return -1; err = exec_cmd(&skel->bss->monitored_pid); - if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) - goto close_prog; + if (!ASSERT_OK(err, "exec_cmd")) + return err; - CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n", - skel->bss->bprm_count); + ASSERT_EQ(skel->bss->bprm_count, 1, "bprm_count"); skel->bss->monitored_pid = getpid(); err = stack_mprotect(); - if (CHECK(errno != EPERM, "stack_mprotect", "want err=EPERM, got %d\n", - errno)) - goto close_prog; + if (!ASSERT_EQ(errno, EPERM, "stack_mprotect")) + return err; - CHECK(skel->bss->mprotect_count != 1, "mprotect_count", - "mprotect_count = %d\n", skel->bss->mprotect_count); + ASSERT_EQ(skel->bss->mprotect_count, 1, "mprotect_count"); syscall(__NR_setdomainname, &buf, -2L); syscall(__NR_setdomainname, 0, -3L); syscall(__NR_setdomainname, ~0L, -4L); - CHECK(skel->bss->copy_test != 3, "copy_test", - "copy_test = %d\n", skel->bss->copy_test); + ASSERT_EQ(skel->bss->copy_test, 3, "copy_test"); + + lsm__detach(skel); + + skel->bss->copy_test = 0; + skel->bss->bprm_count = 0; + skel->bss->mprotect_count = 0; + return 0; +} + +void test_test_lsm(void) +{ + struct lsm *skel = NULL; + int err; + + skel = lsm__open_and_load(); + if (!ASSERT_OK_PTR(skel, "lsm_skel_load")) + goto close_prog; + + err = test_lsm(skel); + if (!ASSERT_OK(err, "test_lsm_first_attach")) + goto close_prog; + + err = test_lsm(skel); + ASSERT_OK(err, "test_lsm_second_attach"); close_prog: lsm__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 115a3b0ad984..474c6a62078a 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -57,6 +57,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct 
bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -127,6 +148,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 4c0d348034b9..c19cfa869f30 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -63,6 +63,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -141,6 +162,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); 
ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 6939bfd8690f..f62df4d023f9 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -174,8 +174,8 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) * as long as it is used in one of the func ptr * under SEC(".struct_ops"). */ -SEC("struct_ops/bictcp_init") -void BPF_PROG(bictcp_init, struct sock *sk) +SEC("struct_ops/bpf_cubic_init") +void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -192,7 +192,7 @@ void BPF_PROG(bictcp_init, struct sock *sk) * The remaining tcp-cubic functions have an easier way. */ SEC("no-sec-prefix-bictcp_cwnd_event") -void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -384,7 +384,7 @@ tcp_friendliness: } /* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. 
*/ -void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -403,7 +403,7 @@ void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -420,7 +420,7 @@ __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -496,7 +496,7 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) } } -void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -525,21 +525,21 @@ void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, hystart_update(sk, delay); } -__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; - return max(tp->snd_cwnd, tp->prior_cwnd); +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); } SEC(".struct_ops") struct tcp_congestion_ops cubic = { - .init = (void *)bictcp_init, - .ssthresh = (void *)bictcp_recalc_ssthresh, - .cong_avoid = (void *)bictcp_cong_avoid, - .set_state = (void *)bictcp_state, - .undo_cwnd = (void *)tcp_reno_undo_cwnd, - .cwnd_event = (void *)bictcp_cwnd_event, - .pkts_acked = (void *)bictcp_acked, + .init = (void *)bpf_cubic_init, + 
.ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_avoid = (void *)bpf_cubic_cong_avoid, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, .name = "bpf_cubic", }; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 4dc1a967776a..fd42247da8b4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -194,22 +194,12 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } -SEC("struct_ops/tcp_reno_cong_avoid") -void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tcp_is_cwnd_limited(sk)) - return; +extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; - /* In "safe" area, increase. */ - if (tcp_in_slow_start(tp)) { - acked = tcp_slow_start(tp, acked); - if (!acked) - return; - } - /* In dangerous area, increase slowly. 
*/ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); +SEC("struct_ops/dctcp_reno_cong_avoid") +void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +{ + tcp_reno_cong_avoid(sk, ack, acked); } SEC(".struct_ops") @@ -226,7 +216,7 @@ struct tcp_congestion_ops dctcp = { .in_ack_event = (void *)dctcp_update_alpha, .cwnd_event = (void *)dctcp_cwnd_event, .ssthresh = (void *)dctcp_ssthresh, - .cong_avoid = (void *)tcp_reno_cong_avoid, + .cong_avoid = (void *)dctcp_cong_avoid, .undo_cwnd = (void *)dctcp_cwnd_undo, .set_state = (void *)dctcp_state, .flags = TCP_CONG_NEEDS_ECN, diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c index 50e59a2e142e..43c36f5f7649 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c @@ -35,3 +35,30 @@ int dump_task_stack(struct bpf_iter__task *ctx) return 0; } + +SEC("iter/task") +int get_task_user_stacks(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + uint64_t buf_sz = 0; + int64_t res; + + if (task == (void *)0) + return 0; + + res = bpf_get_task_stack(task, entries, + MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, BPF_F_USER_STACK); + if (res <= 0) + return 0; + + buf_sz += res; + + /* If the verifier doesn't refine bpf_get_task_stack res, and instead + * assumes res is entirely unknown, this program will fail to load as + * the verifier will believe that max buf_sz value allows reading + * past the end of entries in bpf_seq_write call + */ + bpf_seq_write(seq, &entries, buf_sz); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c new file mode 100644 index 000000000000..2ecd833dcd41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include 
<linux/types.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "X"; + +void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +{ +} + +SEC(".struct_ops") +struct tcp_congestion_ops bpf_nogpltcp = { + .init = (void *)nogpltcp_init, + .name = "bpf_nogpltcp", +}; diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c deleted file mode 100644 index dd0ffa518f36..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_arr_kind x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c deleted file mode 100644 index bc83372088ad..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_arr_value_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c deleted file mode 100644 index 917bec41be08..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_kind x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c deleted file mode 100644 index 6ec7e6ec1c91..000000000000 --- 
a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_sz x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c deleted file mode 100644 index 7bbcacf2b0d1..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c deleted file mode 100644 index f384dd38ec70..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_struct_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c new file mode 100644 index 000000000000..d14b496190c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_existence___wrong_field_defs x) {} diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..8aaa24a00322 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -174,6 +174,12 @@ struct struct_in_struct { }; }; +struct struct_in_array {}; + +struct struct_in_array_typed 
{}; + +typedef struct struct_in_array_typed struct_in_array_t[2]; + struct struct_with_embedded_stuff { int a; struct { @@ -203,6 +209,14 @@ struct struct_with_embedded_stuff { } r[5]; struct struct_in_struct s[10]; int t[11]; + struct struct_in_array (*u)[2]; + struct_in_array_t *v; +}; + +struct float_struct { + float f; + const double *d; + volatile long double *ld; }; struct root_struct { @@ -219,6 +233,7 @@ struct root_struct { union_fwd_t *_12; union_fwd_ptr_t _13; struct struct_with_embedded_stuff _14; + struct float_struct _15; }; /* ------ END-EXPECTED-OUTPUT ------ */ diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9a2850850121..c95c0cabe951 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -700,27 +700,11 @@ struct core_reloc_existence___minimal { int a; }; -struct core_reloc_existence___err_wrong_int_sz { - short a; -}; - -struct core_reloc_existence___err_wrong_int_type { +struct core_reloc_existence___wrong_field_defs { + void *a; int b[1]; -}; - -struct core_reloc_existence___err_wrong_int_kind { struct{ int x; } c; -}; - -struct core_reloc_existence___err_wrong_arr_kind { int arr; -}; - -struct core_reloc_existence___err_wrong_arr_value_type { - short arr[1]; -}; - -struct core_reloc_existence___err_wrong_struct_type { int s; }; @@ -807,6 +791,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -816,6 +801,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; struct core_reloc_size___diff_sz { @@ -825,6 +811,7 @@ struct core_reloc_size___diff_sz { char arr_field[10]; void *ptr_field; enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field; + double float_field; }; /* Error case of two candidates with the fields (int_field) at the same @@ -839,6 +826,7 @@ 
struct core_reloc_size___err_ambiguous1 { int arr_field[4]; void *ptr_field; enum { VALUE___1 = 123 } enum_field; + float float_field; }; struct core_reloc_size___err_ambiguous2 { @@ -850,6 +838,7 @@ struct core_reloc_size___err_ambiguous2 { int arr_field[4]; void *ptr_field; enum { VALUE___2 = 123 } enum_field; + float float_field; }; /* diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 5f645fdaba6f..52a550d281d9 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -64,7 +64,7 @@ __u64 test7_result = 0; SEC("fentry/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/fexit_sleep.c b/tools/testing/selftests/bpf/progs/fexit_sleep.c new file mode 100644 index 000000000000..03a672d76353 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fexit_sleep.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char LICENSE[] SEC("license") = "GPL"; + +int pid = 0; +int fentry_cnt = 0; +int fexit_cnt = 0; + +SEC("fentry/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fentry_cnt++; + return 0; +} + +SEC("fexit/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fexit_cnt++; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 0952affb22a6..8f1ccb7302e1 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -65,7 +65,7 @@ __u64 test7_result = 0; 
SEC("fexit/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } @@ -74,7 +74,7 @@ __u64 test8_result = 0; SEC("fexit/bpf_fentry_test8") int BPF_PROG(test8, struct bpf_fentry_test_t *arg) { - if (arg->a == 0) + if (!arg->a) test8_result = 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c new file mode 100644 index 000000000000..75e8e1069fe7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + int output; +}; + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + if (*key == 1) + return 1; /* stop the iteration */ + return 0; +} + +__u32 cpu = 0; +__u64 percpu_val = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + cpu = bpf_get_smp_processor_id(); + percpu_val = *val; + return 0; +} + +u32 arraymap_output = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0); + arraymap_output = data.output; + + bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c 
b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c new file mode 100644 index 000000000000..913dd91aafff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + struct __sk_buff *ctx; + int input; + int output; +}; + +static __u64 +check_hash_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + struct __sk_buff *skb = data->ctx; + __u32 k; + __u64 v; + + if (skb) { + k = *key; + v = *val; + if (skb->len == 10000 && k == 10 && v == 10) + data->output = 3; /* impossible path */ + else + data->output = 4; + } else { + data->output = data->input; + bpf_map_delete_elem(map, key); + } + + return 0; +} + +__u32 cpu = 0; +__u32 percpu_called = 0; +__u32 percpu_key = 0; +__u64 percpu_val = 0; +int percpu_output = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *unused) +{ + struct callback_ctx data; + + percpu_called++; + cpu = bpf_get_smp_processor_id(); + percpu_key = *key; + percpu_val = *val; + + data.ctx = 0; + data.input = 100; + data.output = 0; + bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + percpu_output = data.output; + + return 0; +} + +int hashmap_output = 0; +int hashmap_elems = 0; +int percpu_map_elems = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.ctx = skb; + data.input = 10; + data.output = 0; + hashmap_elems = bpf_for_each_map_elem(&hashmap, check_hash_elem, 
&data, 0); + hashmap_output = data.output; + + percpu_map_elems = bpf_for_each_map_elem(&percpu_map, check_percpu_elem, + (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c new file mode 100644 index 000000000000..470f8723e463 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; + +SEC("classifier") +int kfunc_call_test2(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + __u64 a = 1ULL << 32; + __u32 ret; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); + ret = a >> 32; /* ret should be 2 */ + ret += (__u32)a; /* ret should be 12 */ + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c new file mode 100644 index 000000000000..b2dcb7d9cb03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern const int bpf_prog_active __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) 
__ksym; +extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +int active_res = -1; +int sk_state = -1; + +int __noinline f1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + int *active; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, + bpf_get_smp_processor_id()); + if (active) + active_res = *active; + + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + return f1(skb); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c new file mode 100644 index 000000000000..b964ec1390c2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between two files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val1; +int output_ctx1; +int output_weak1; + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 1; +} + +/* Global functions can't be void */ +int set_output_val1(int x) +{ + output_val1 = x + subprog(x); + return x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. 
__hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx1(__u64 *ctx) +{ + output_ctx1 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should win because it's the first one */ +__weak int set_output_weak(int x) +{ + output_weak1 = x; + return x; +} + +extern int set_output_val2(int x); + +/* here we'll force set_output_ctx2() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx2(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val2(1000); + set_output_ctx2(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c new file mode 100644 index 000000000000..575e958e60b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* weak and shared between both files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val2; +int output_ctx2; +int output_weak2; /* should stay zero */ + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +/* Global functions can't be void */ +int set_output_val2(int x) +{ + output_val2 = 2 * x + 2 * subprog(x); + return 2 * x; +} + +/* This function 
can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx2(__u64 *ctx) +{ + output_ctx2 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should lose, because it will be processed second */ +__weak int set_output_weak(int x) +{ + output_weak2 = x; + return 2 * x; +} + +extern int set_output_val1(int x); + +/* here we'll force set_output_ctx1() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx1(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val1(2000); + set_output_ctx1(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. 
+ */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps1.c b/tools/testing/selftests/bpf/progs/linked_maps1.c new file mode 100644 index 000000000000..52291515cc72 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps1.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct my_key { long x; }; +struct my_value { long x; }; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct my_key); + __type(value, struct my_value); + __uint(max_entries, 16); +} map1 SEC(".maps"); + + /* Matches map2 definition in linked_maps2.c. Order of the attributes doesn't + * matter. + */ +typedef struct { + __uint(max_entries, 8); + __type(key, int); + __type(value, int); + __uint(type, BPF_MAP_TYPE_ARRAY); +} map2_t; + +extern map2_t map2 SEC(".maps"); + +/* This should be the winning map definition, but we have no way of verifying, + * so we just make sure that it links and works without errors + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first1; +int output_second1; +int output_weak1; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter1) +{ + /* update values with key = 1 */ + int key = 1, val = 1; + struct my_key key_struct = { .x = 1 }; + struct my_value val_struct = { .x = 1000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit1) +{ + /* lookup values with key = 2, set in another file */ + int key = 2, *val; + struct my_key key_struct = { .x = 2 }; + struct my_value *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if 
(value_struct) + output_first1 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second1 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak1 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps2.c b/tools/testing/selftests/bpf/progs/linked_maps2.c new file mode 100644 index 000000000000..0693687474ed --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps2.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* modifiers and typedefs are ignored when comparing key/value types */ +typedef struct my_key { long x; } key_type; +typedef struct my_value { long x; } value_type; + +extern struct { + __uint(max_entries, 16); + __type(key, key_type); + __type(value, value_type); + __uint(type, BPF_MAP_TYPE_HASH); +} map1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 8); +} map2 SEC(".maps"); + +/* this definition will lose, but it has to exactly match the winner */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first2; +int output_second2; +int output_weak2; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter2) +{ + /* update values with key = 2 */ + int key = 2, val = 2; + key_type key_struct = { .x = 2 }; + value_type val_struct = { .x = 2000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit2) +{ + /* lookup values with key = 1, set in another file */ + int key = 1, *val; + key_type key_struct = { .x = 1 }; + value_type *value_struct; + + 
value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first2 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second2 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak2 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars1.c b/tools/testing/selftests/bpf/progs/linked_vars1.c new file mode 100644 index 000000000000..ef9e9d0bb0ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars1.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* this weak extern will be strict due to the other file's strong extern */ +extern bool CONFIG_BPF_SYSCALL __kconfig __weak; +extern const void bpf_link_fops __ksym __weak; + +int input_bss1; +int input_data1 = 1; +const volatile int input_rodata1 = 11; + +int input_bss_weak __weak; +/* these two definitions should win */ +int input_data_weak __weak = 10; +const volatile int input_rodata_weak __weak = 100; + +extern int input_bss2; +extern int input_data2; +extern const int input_rodata2; + +int output_bss1; +int output_data1; +int output_rodata1; + +long output_sink1; + +static __noinline int get_bss_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_bss1 + input_bss2 + input_bss_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1) +{ + output_bss1 = get_bss_res(); + output_data1 = input_data1 + input_data2 + input_data_weak; + output_rodata1 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink1 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&bpf_link_fops; + return 0; +} + +char LICENSE[] SEC("license") = 
"GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars2.c b/tools/testing/selftests/bpf/progs/linked_vars2.c new file mode 100644 index 000000000000..e4f5bd388a3c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars2.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +extern int LINUX_KERNEL_VERSION __kconfig; +/* when an extern is defined as both strong and weak, resulting symbol will be strong */ +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern const void __start_BTF __ksym; + +int input_bss2; +int input_data2 = 2; +const volatile int input_rodata2 = 22; + +int input_bss_weak __weak; +/* these two weak variables should lose */ +int input_data_weak __weak = 20; +const volatile int input_rodata_weak __weak = 200; + +extern int input_bss1; +extern int input_data1; +extern const int input_rodata1; + +int output_bss2; +int output_data2; +int output_rodata2; + +int output_sink2; + +static __noinline int get_data_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_data1 + input_data2 + input_data_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2) +{ + output_bss2 = input_bss1 + input_bss2 + input_bss_weak; + output_data2 = get_data_res(); + output_rodata2 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink2 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&__start_BTF; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c new file mode 100644 index 000000000000..38de0331e6b4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ptrace.h> +#include <stddef.h> 
+#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +/* typically virtio scsi has max SGs of 6 */ +#define VIRTIO_MAX_SGS 6 + +/* Verifier will fail with SG_MAX = 128. The failure can be + * workarounded with a smaller SG_MAX, e.g. 10. + */ +#define WORKAROUND +#ifdef WORKAROUND +#define SG_MAX 10 +#else +/* typically virtio blk has max SEG of 128 */ +#define SG_MAX 128 +#endif + +#define SG_CHAIN 0x01UL +#define SG_END 0x02UL + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; +}; + +#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) +#define sg_is_last(sg) ((sg)->page_link & SG_END) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) + +static inline struct scatterlist *__sg_next(struct scatterlist *sgp) +{ + struct scatterlist sg; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_last(&sg)) + return NULL; + + sgp++; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_chain(&sg)) + sgp = sg_chain_ptr(&sg); + + return sgp; +} + +static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) +{ + struct scatterlist *sgp; + + bpf_probe_read_kernel(&sgp, sizeof(sgp), sgs + i); + return sgp; +} + +int config = 0; +int result = 0; + +SEC("kprobe/virtqueue_add_sgs") +int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) +{ + struct scatterlist *sgp = NULL; + __u64 length1 = 0, length2 = 0; + unsigned int i, n, len; + + if (config != 0) + return 0; + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length1 += len; + n++; + } + } + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < in_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = 
__sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length2 += len; + n++; + } + } + + config = 1; + result = length2 - length1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index d8850bc6a9f1..d1d304c980f0 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -12,6 +12,7 @@ _Static_assert(MAX_ENTRIES < LOOP_BOUND, "MAX_ENTRIES must be < LOOP_BOUND"); enum bpf_map_type g_map_type = BPF_MAP_TYPE_UNSPEC; __u32 g_line = 0; +int page_size = 0; /* userspace should set it */ #define VERIFY_TYPE(type, func) ({ \ g_map_type = type; \ @@ -635,7 +636,6 @@ struct bpf_ringbuf_map { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } m_ringbuf SEC(".maps"); static inline int check_ringbuf(void) @@ -643,7 +643,7 @@ static inline int check_ringbuf(void) struct bpf_ringbuf_map *ringbuf = (struct bpf_ringbuf_map *)&m_ringbuf; struct bpf_map *map = (struct bpf_map *)&m_ringbuf; - VERIFY(check(&ringbuf->map, map, 0, 0, 1 << 12)); + VERIFY(check(&ringbuf->map, map, 0, 0, page_size)); return 1; } diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index cf6823f42e80..7f2eaa2f89f8 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -4,7 +4,6 @@ #include <bpf/bpf_core_read.h> #include <bpf/bpf_helpers.h> -#define NULL 0 #define INLINE __always_inline #define skb_shorter(skb, len) ((void *)(long)(skb)->data + (len) > (void *)(long)skb->data_end) diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c index fdb4bf4408fa..eeaf6e75c9a2 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c @@ -8,18 +8,6 @@ int _version 
SEC("version") = 1; SEC("sk_msg1") int bpf_prog1(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - - char *d; - - if (data + 8 > data_end) - return SK_DROP; - - bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); - d = (char *)data; - bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); - return SK_PASS; } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index d3597f81e6e9..8acdb99b5959 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -6,11 +6,8 @@ #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 -#endif +int page_size = 0; /* userspace should set it */ #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -90,7 +87,7 @@ int _getsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; @@ -161,7 +158,7 @@ int _setsockopt(struct bpf_sockopt *ctx) if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { /* Original optlen is larger than PAGE_SIZE. */ - if (ctx->optlen != PAGE_SIZE * 2) + if (ctx->optlen != page_size * 2) return 0; /* EPERM, unexpected data size */ if (optval + 1 > optval_end) @@ -175,7 +172,7 @@ int _setsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. 
*/ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c new file mode 100644 index 000000000000..80a0a20db88d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} enter_id SEC(".maps"); + +#define MAGIC_VALUE 0xabcd1234 + +pid_t target_pid = 0; +int mismatch_cnt = 0; +int enter_cnt = 0; +int exit_cnt = 0; + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&enter_cnt, 1); + *ptr = MAGIC_VALUE + enter_cnt; + + return 0; +} + +SEC("tp_btf/sys_exit") +int BPF_PROG(on_exit, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&exit_cnt, 1); + if (*ptr != MAGIC_VALUE + exit_cnt) + __sync_fetch_and_add(&mismatch_cnt, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c new file mode 100644 index 000000000000..81758c0aef99 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} task_storage SEC(".maps"); + +int valid_ptr_count = 0; +int null_ptr_count = 0; + +SEC("fentry/exit_creds") +int BPF_PROG(trace_exit_creds, struct task_struct *task) +{ + __u64 *ptr; + + ptr = bpf_task_storage_get(&task_storage, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + __sync_fetch_and_add(&valid_ptr_count, 1); + else + __sync_fetch_and_add(&null_ptr_count, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c new file mode 100644 index 000000000000..564583dca7c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_a SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_b SEC(".maps"); + +SEC("fentry/bpf_local_storage_lookup") +int BPF_PROG(on_lookup) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + bpf_task_storage_delete(&map_a, task); + bpf_task_storage_delete(&map_b, task); + return 0; +} + +SEC("fentry/bpf_local_storage_update") +int BPF_PROG(on_update) +{ + struct task_struct *task = bpf_get_current_task_btf(); + long *ptr; + + ptr = 
bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 200; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 100; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index b7787b43f9db..c4a9bae96e75 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -105,6 +105,54 @@ int xdp_minus_delta(struct xdp_md *ctx) return retval; } +SEC("xdp") +int xdp_input_len(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input len is L3, like MTU and iph->tot_len. + * Remember XDP data_len is L2. + */ + __u32 mtu_len = data_len - ETH_HLEN; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = XDP_ABORTED; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_input_len_exceed(struct xdp_md *ctx) +{ + int retval = XDP_ABORTED; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. 
+ */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = XDP_PASS ; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + SEC("classifier") int tc_use_helper(struct __sk_buff *ctx) { @@ -196,3 +244,47 @@ int tc_minus_delta(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("classifier") +int tc_input_len(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = BPF_DROP; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("classifier") +int tc_input_len_exceed(struct __sk_buff *ctx) +{ + int retval = BPF_DROP; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. 
+ */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_OK; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index d7fb6cfc7891..7b2d576aeea1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -21,6 +21,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -30,6 +31,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; SEC("raw_tracepoint/sys_enter") @@ -45,6 +47,7 @@ int test_core_size(void *ctx) out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]); out->ptr_sz = bpf_core_field_size(in->ptr_field); out->enum_sz = bpf_core_field_size(in->enum_field); + out->float_sz = bpf_core_field_size(in->float_field); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 61c2ae92ce41..97b7031d0e22 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -14,7 +14,7 @@ struct Big { __noinline int foo(const struct Big *big) { - if (big == 0) + if (!big) return 0; return bpf_get_prandom_u32() < big->y; diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 4eb42cff5fe9..5a5cc19a15bf 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,7 +9,6 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4096); __uint(map_flags, 
BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); __type(key, __u32); __type(value, char); @@ -17,7 +16,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); __type(value, __u64); diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 8ba9959b036b..6b3f288b7c63 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -15,7 +15,6 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf SEC(".maps"); /* inputs */ diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c index edf3b6953533..197b86546dca 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -15,7 +15,6 @@ struct sample { struct ringbuf_map { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf1 SEC(".maps"), ringbuf2 SEC(".maps"); @@ -31,6 +30,17 @@ struct { }, }; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_hash SEC(".maps") = { + .values = { + [0] = &ringbuf1, + }, +}; + /* inputs */ int pid = 0; int target_ring = 0; diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c index 1032b292af5b..ac6f7f205e25 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c @@ -64,6 +64,10 @@ static const int PROG_DONE = 1; static const __u32 KEY_SERVER_A = SERVER_A; static const __u32 KEY_SERVER_B = SERVER_B; +static const __u16 SRC_PORT = bpf_htons(8008); +static const __u32 SRC_IP4 = IP4(127, 0, 0, 2); +static const __u32 SRC_IP6[] = 
IP6(0xfd000000, 0x0, 0x0, 0x00000002); + static const __u16 DST_PORT = 7007; /* Host byte order */ static const __u32 DST_IP4 = IP4(127, 0, 0, 1); static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001); @@ -398,11 +402,12 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) if (LSW(ctx->protocol, 0) != IPPROTO_TCP) return SK_DROP; - /* Narrow loads from remote_port field. Expect non-0 value. */ - if (LSB(ctx->remote_port, 0) == 0 && LSB(ctx->remote_port, 1) == 0 && - LSB(ctx->remote_port, 2) == 0 && LSB(ctx->remote_port, 3) == 0) + /* Narrow loads from remote_port field. Expect SRC_PORT. */ + if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) || + LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) || + LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0) return SK_DROP; - if (LSW(ctx->remote_port, 0) == 0) + if (LSW(ctx->remote_port, 0) != SRC_PORT) return SK_DROP; /* Narrow loads from local_port field. Expect DST_PORT. */ @@ -415,11 +420,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv4 fields */ if (v4) { - /* Expect non-0.0.0.0 in remote_ip4 */ - if (LSB(ctx->remote_ip4, 0) == 0 && LSB(ctx->remote_ip4, 1) == 0 && - LSB(ctx->remote_ip4, 2) == 0 && LSB(ctx->remote_ip4, 3) == 0) + /* Expect SRC_IP4 in remote_ip4 */ + if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) || + LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) || + LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) || + LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip4, 0) == 0 && LSW(ctx->remote_ip4, 1) == 0) + if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) || + LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP4 in local_ip4 */ @@ -448,20 +456,32 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv6 fields */ if (!v4) { - /* Expect non-:: IP in remote_ip6 */ - if (LSB(ctx->remote_ip6[0], 0) == 0 && 
LSB(ctx->remote_ip6[0], 1) == 0 && - LSB(ctx->remote_ip6[0], 2) == 0 && LSB(ctx->remote_ip6[0], 3) == 0 && - LSB(ctx->remote_ip6[1], 0) == 0 && LSB(ctx->remote_ip6[1], 1) == 0 && - LSB(ctx->remote_ip6[1], 2) == 0 && LSB(ctx->remote_ip6[1], 3) == 0 && - LSB(ctx->remote_ip6[2], 0) == 0 && LSB(ctx->remote_ip6[2], 1) == 0 && - LSB(ctx->remote_ip6[2], 2) == 0 && LSB(ctx->remote_ip6[2], 3) == 0 && - LSB(ctx->remote_ip6[3], 0) == 0 && LSB(ctx->remote_ip6[3], 1) == 0 && - LSB(ctx->remote_ip6[3], 2) == 0 && LSB(ctx->remote_ip6[3], 3) == 0) + /* Expect SRC_IP6 in remote_ip6 */ + if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) || + LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) || + LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) || + LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) || + LSB(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xff) || + LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) || + LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) || + LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) || + LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) || + LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) || + LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) || + LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) || + LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) || + LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) || + LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) || + LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip6[0], 0) == 0 && LSW(ctx->remote_ip6[0], 1) == 0 && - LSW(ctx->remote_ip6[1], 0) == 0 && LSW(ctx->remote_ip6[1], 1) == 0 && - LSW(ctx->remote_ip6[2], 0) == 0 && LSW(ctx->remote_ip6[2], 1) == 0 && - LSW(ctx->remote_ip6[3], 0) == 0 && LSW(ctx->remote_ip6[3], 1) == 0) + if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 
16) & 0xffff) || + LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP6 in local_ip6 */ if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) || diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c new file mode 100644 index 000000000000..951a0301c553 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char num_out[64] = {}; +long num_ret = 0; + +char ip_out[64] = {}; +long ip_ret = 0; + +char sym_out[64] = {}; +long sym_ret = 0; + +char addr_out[64] = {}; +long addr_ret = 0; + +char str_out[64] = {}; +long str_ret = 0; + +char over_out[6] = {}; +long over_ret = 0; + +char pad_out[10] = {}; +long pad_ret = 0; + +char noarg_out[64] = {}; +long noarg_ret = 0; + +long nobuf_ret = 0; + +extern const void schedule __ksym; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + /* Convenient values to pretty-print */ + const __u8 ex_ipv4[] = {127, 0, 0, 1}; + const __u8 ex_ipv6[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + static const char str1[] = "str1"; + static const char longstr[] = "longstr"; + + /* Integer types */ + num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), + "%d %u %x %li %llu %lX", + -8, 9, 150, -424242, 1337, 0xDABBAD00); + /* IP addresses */ + ip_ret = BPF_SNPRINTF(ip_out, sizeof(ip_out), "%pi4 %pI6", + &ex_ipv4, &ex_ipv6); + /* Symbol lookup formatting */ + sym_ret = BPF_SNPRINTF(sym_out, sizeof(sym_out), "%ps %pS 
%pB", + &schedule, &schedule, &schedule); + /* Kernel pointers */ + addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", + 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); + /* Strings embedding */ + str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", + str1, longstr); + /* Overflow */ + over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); + /* Padding of fixed width numbers */ + pad_ret = BPF_SNPRINTF(pad_out, sizeof(pad_out), "%5d %0900000X", 4, 4); + /* No args */ + noarg_ret = BPF_SNPRINTF(noarg_out, sizeof(noarg_out), "simple case"); + /* No buffer */ + nobuf_ret = BPF_SNPRINTF(NULL, 0, "only interested in length %d", 60); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf_single.c b/tools/testing/selftests/bpf/progs/test_snprintf_single.c new file mode 100644 index 000000000000..402adaf344f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf_single.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* The format string is filled from the userspace such that loading fails */ +static const char fmt[10]; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + unsigned long long arg = 42; + + bpf_snprintf(NULL, 0, fmt, &arg, sizeof(arg)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..a39eba9f5201 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -29,15 +29,16 @@ struct { } verdict_map SEC(".maps"); static volatile bool test_sockmap; /* toggled by user-space */ +static volatile bool test_ingress; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; @@ -55,6 +56,27 @@ int prog_skb_verdict(struct __sk_buff *skb) return verdict; } +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, + test_ingress ? BPF_F_INGRESS : 0); + else + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, + test_ingress ? 
BPF_F_INGRESS : 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + SEC("sk_msg") int prog_msg_verdict(struct sk_msg_md *msg) { diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c new file mode 100644 index 000000000000..2d31f66e4f23 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_static_linked1.c b/tools/testing/selftests/bpf/progs/test_static_linked1.c new file mode 100644 index 000000000000..ea1a6c4c7172 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_static_linked1.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* 8-byte aligned .bss */ +static volatile long static_var1; +static volatile int static_var11; +int var1 = 0; +/* 4-byte aligned .rodata */ +const volatile int rovar1; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +SEC("raw_tp/sys_enter") +int handler1(const void *ctx) +{ + var1 = subprog(rovar1) + static_var1 + static_var11; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +int VERSION SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_static_linked2.c b/tools/testing/selftests/bpf/progs/test_static_linked2.c new file mode 100644 index 000000000000..54d8d1ab577c --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/test_static_linked2.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* 4-byte aligned .bss */ +static volatile int static_var2; +static volatile int static_var22; +int var2 = 0; +/* 8-byte aligned .rodata */ +const volatile long rovar2; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 3; +} + +SEC("raw_tp/sys_enter") +int handler2(const void *ctx) +{ + var2 = subprog(rovar2) + static_var2 + static_var22; + + return 0; +} + +/* different name and/or type of the variable doesn't matter */ +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 37bce7a7c394..84cd63259554 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -24,14 +24,29 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) + #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 #define ETH_OVER_UDP_PORT 7777 +#define VXLAN_UDP_PORT 8472 + +#define EXTPROTO_VXLAN 0x1 + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) +#define VXLAN_FLAGS 0x8 +#define VXLAN_VNI 1 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +} __attribute__((packed)); + struct gre_hdr { __be16 flags; __be16 protocol; @@ -45,13 +60,13 @@ union l4hdr { struct v4hdr { struct iphdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... 
*/ } __attribute__((packed)); struct v6hdr { struct ipv6hdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -69,14 +84,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, - __u16 l2_proto) +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; int tcp_off; __u64 flags; @@ -141,7 +157,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -171,14 +191,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; + break; } olen += l2_len; @@ -214,14 +246,21 @@ static __always_inline int 
encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto) { + return __encap_ipv4(skb, encap_proto, l2_proto, 0); +} + +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) +{ __u16 udp_dst = UDP_PORT; struct ipv6hdr iph_inner; struct v6hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; __u16 tot_len; __u64 flags; @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + - sizeof(h_outer.l4hdr.udp); + sizeof(h_outer.l4hdr.udp) + l2_len; h_outer.l4hdr.udp.check = 0; h_outer.l4hdr.udp.len = bpf_htons(tot_len); break; @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & 
VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; break; } @@ -309,6 +363,12 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) +{ + return __encap_ipv6(skb, encap_proto, l2_proto, 0); +} + SEC("encap_ipip_none") int __encap_ipip_none(struct __sk_buff *skb) { @@ -372,6 +432,17 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_vxlan_eth") +int __encap_vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return __encap_ipv4(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + SEC("encap_sit_none") int __encap_sit_none(struct __sk_buff *skb) { @@ -444,6 +515,17 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ip6vxlan_eth") +int __encap_ip6vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return __encap_ipv6(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { char buf[sizeof(struct v6hdr)]; @@ -479,6 +561,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) case ETH_OVER_UDP_PORT: olen += ETH_HLEN; break; + case VXLAN_UDP_PORT: + olen += ETH_HLEN + sizeof(struct vxlanhdr); + break; } break; default: diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 9afe947cfae9..e7b673117436 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -396,7 +396,7 @@ int _ip6vxlan_get_tunnel(struct __sk_buff *skb) SEC("geneve_set_tunnel") int _geneve_set_tunnel(struct __sk_buff *skb) { - int ret, 
ret2; + int ret; struct bpf_tunnel_key key; struct geneve_opt gopt; @@ -508,10 +508,8 @@ int _ip6geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 2db3c60e1e61..ac349a5cea7e 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,23 +85,6 @@ make_with_tmpdir() { echo } -make_doc_and_clean() { - echo -e "\$PWD: $PWD" - echo -e "command: make -s $* doc >/dev/null" - RST2MAN_OPTS="--exit-status=1" make $J -s $* doc - if [ $? -ne 0 ] ; then - ERROR=1 - printf "FAILURE: Errors or warnings when building documentation\n" - fi - ( - if [ $# -ge 1 ] ; then - cd ${@: -1} - fi - make -s doc-clean - ) - echo -} - echo "Trying to build bpftool" echo -e "... 
through kbuild\n" @@ -162,7 +145,3 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O - -echo -e "Checking documentation build\n" -# From tools/bpf/bpftool -make_doc_and_clean diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 2023725f1962..e2394eea4b7f 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -66,4 +66,7 @@ #define BTF_FUNC_ENC(name, func_proto) \ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) + #endif /* _TEST_BTF_H */ diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh new file mode 100755 index 000000000000..7eb940a7b2eb --- /dev/null +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +# Assume script is located under tools/testing/selftests/bpf/. We want to start +# build attempts from the top of kernel repository. +SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) +KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +cd $KDIR_ROOT_DIR + +for tgt in docs docs-clean; do + make -s -C $PWD/$SCRIPT_REL_DIR $tgt; +done diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f7c2fd89d01a..dda52cb649dc 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -130,6 +130,20 @@ extern int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) 
\ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_TRUE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = (actual); \ + CHECK(!___ok, (name), "unexpected %s: got FALSE\n", (name)); \ + ___ok; \ +}) + +#define ASSERT_FALSE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = !(actual); \ + CHECK(!___ok, (name), "unexpected %s: got TRUE\n", (name)); \ + ___ok; \ +}) + #define ASSERT_EQ(actual, expected, name) ({ \ static int duration = 0; \ typeof(actual) ___act = (actual); \ @@ -152,6 +166,50 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act < ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld >= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_LE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act <= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld > expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act > ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld <= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act >= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld < expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, 
expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ @@ -167,7 +225,8 @@ extern int test__join_cgroup(const char *path); static int duration = 0; \ long long ___res = (res); \ bool ___ok = ___res == 0; \ - CHECK(!___ok, (name), "unexpected error: %lld\n", ___res); \ + CHECK(!___ok, (name), "unexpected error: %lld (errno %d)\n", \ + ___res, errno); \ ___ok; \ }) @@ -199,7 +258,7 @@ extern int test__join_cgroup(const char *path); #define ASSERT_ERR_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = IS_ERR(___res) \ + bool ___ok = IS_ERR(___res); \ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \ ___ok; \ }) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 427ca00a3217..eefd445b96fc 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -732,7 +732,7 @@ static int sendmsg_test(struct sockmap_options *opt) * socket is not a valid test. So in this case lets not * enable kTLS but still run the test. 
*/ - if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) { + if (!txmsg_redir || txmsg_ingress) { err = sockmap_init_ktls(opt->verbose, rx_fd); if (err) return err; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7c76b841b17b..c9dde9b9d987 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -44,8 +44,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 sleep 1 @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then echo "sit" $0 ipv6 sit none 100 + echo "ip4 vxlan" + $0 ipv4 vxlan eth 2000 + + echo "ip6 vxlan" + $0 ipv6 ip6vxlan eth 2000 + for mac in none mpls eth ; do echo "ip gre $mac" $0 ipv4 gre $mac 100 @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $dport" elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ttype=$gretaptype +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then + ttype="vxlan" + targs="id 1 dstport 8472 udp6zerocsumrx" else ttype=$tuntype targs="" @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then # No support for TEB fou tunnel; expect failure. expect_tun_fail=1 -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then # Share ethernet address between tunnel/veth2 so L2 decap works. 
ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ awk '/ether/ { print $2 }') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 58b5a349d3ba..1512092e1e68 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -105,7 +105,7 @@ struct bpf_test { enum bpf_prog_type prog_type; uint8_t flags; void (*fill_helper)(struct bpf_test *self); - uint8_t runs; + int runs; #define bpf_testdata_struct_t \ struct { \ uint32_t retval, retval_unpriv; \ @@ -1165,7 +1165,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, run_errs = 0; run_successes = 0; - if (!alignment_prevented_execution && fd_prog >= 0) { + if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { uint32_t expected_val; int i; diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 88a7483eaae4..46633a3bfb0b 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -71,13 +71,21 @@ # # Run (full output without color-coding): # sudo ./test_xsk.sh +# +# Run with verbose output: +# sudo ./test_xsk.sh -v +# +# Run and dump packet contents: +# sudo ./test_xsk.sh -D . 
xsk_prereqs.sh -while getopts c flag +while getopts "cvD" flag do case "${flag}" in c) colorconsole=1;; + v) verbose=1;; + D) dump_pkts=1;; esac done @@ -95,17 +103,22 @@ NS1=af_xdp${VETH1_POSTFIX} MTU=1500 setup_vethPairs() { - echo "setting up ${VETH0}: namespace: ${NS0}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH0}: namespace: ${NS0}" + fi ip netns add ${NS1} - ip link add ${VETH0} type veth peer name ${VETH1} + ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4 if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi - echo "setting up ${VETH1}: namespace: ${NS1}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH1}: namespace: ${NS1}" + fi ip link set ${VETH1} netns ${NS1} ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} ip netns exec ${NS1} ip link set ${VETH1} up + ip netns exec ${NS1} ip link set dev lo up ip link set ${VETH0} up } @@ -125,121 +138,24 @@ echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} validate_veth_spec_file -echo "Spec file created: ${SPECFILE}" - -test_status $retval "${TEST_NAME}" - -## START TESTS - -statusList=() - -### TEST 1 -TEST_NAME="XSK KSELFTEST FRAMEWORK" - -echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -retval=$? -if [ $retval -eq 0 ]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" - vethXDPnative ${VETH0} ${VETH1} ${NS1} +if [[ $verbose -eq 1 ]]; then + echo "Spec file created: ${SPECFILE}" + VERBOSE_ARG="-v" fi -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 2 -TEST_NAME="SKB NOPOLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S") -execxdpxceiver params - -retval=$? 
-test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 3 -TEST_NAME="SKB POLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 4 -TEST_NAME="DRV NOPOLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 5 -TEST_NAME="DRV POLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 6 -TEST_NAME="SKB SOCKET TEARDOWN" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 7 -TEST_NAME="DRV SOCKET TEARDOWN" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-T") -execxdpxceiver params +if [[ $dump_pkts -eq 1 ]]; then + DUMP_PKTS_ARG="-D" +fi -retval=$? test_status $retval "${TEST_NAME}" -statusList+=($retval) -### TEST 8 -TEST_NAME="SKB BIDIRECTIONAL SOCKETS" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-B") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) +## START TESTS -### TEST 9 -TEST_NAME="DRV BIDIRECTIONAL SOCKETS" +statusList=() -vethXDPnative ${VETH0} ${VETH1} ${NS1} +TEST_NAME="XSK KSELFTESTS" -params=("-N" "-B") -execxdpxceiver params +execxdpxceiver retval=$? 
test_status $retval "${TEST_NAME}" diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index 1b138cd2b187..1b1c798e9248 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -186,7 +186,7 @@ }, .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", - .errstr = "invalid access to map value, value_size=48 off=44 size=8", + .errstr = "R0 unbounded memory access", .result_unpriv = REJECT, .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 57ed67b86074..8a1caf46ffbc 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -261,8 +261,6 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", - .result_unpriv = REJECT, .errstr = "value -4294967168 makes map_value pointer be out of bounds", .result = REJECT, }, @@ -298,9 +296,6 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - /* not actually fully unbounded, but the bound is very high */ - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", - .result_unpriv = REJECT, .errstr = "value -4294967168 makes map_value pointer be out of bounds", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c index 1fd07a4f27ac..91869aea6d64 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_deduction.c +++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c @@ -6,8 +6,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer 
with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 2", @@ -20,6 +21,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, }, @@ -31,20 +34,24 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 4", .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), - BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R6 has pointer with unsupported alu operation", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -55,8 +62,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 6", @@ -67,8 +75,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 7", @@ -80,8 +89,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -94,8 +104,9 @@ offsetof(struct __sk_buff, mark)), 
BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -106,8 +117,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 10", @@ -119,6 +131,6 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c index 9baca7a75c42..c2aa6f26738b 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c +++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c @@ -19,7 +19,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -43,7 +42,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -69,7 +67,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -94,7 +91,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -141,7 +137,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -210,7 +205,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min 
value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -260,7 +254,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -287,7 +280,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -313,7 +305,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -342,7 +333,6 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R7 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -372,7 +362,6 @@ }, .fixup_map_hash_8b = { 4 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -400,7 +389,5 @@ }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", - .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, - .result_unpriv = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index 69b048cf46d9..3e024c891178 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -42,3 +42,46 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, +{ + "bpf_get_task_stack return R0 range is refined", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_6, 0), // ctx->meta->seq + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 8), // ctx->task + BPF_LD_MAP_FD(BPF_REG_1, 0), // fixup_map_array_48b + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 
0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // keep buf for seq_write + BPF_MOV64_IMM(BPF_REG_3, 48), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_EMIT_CALL(BPF_FUNC_get_task_stack), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_seq_write), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_ITER, + .kfunc = "task", + .runs = -1, // Don't run, just load + .fixup_map_array_48b = { 3 }, +}, diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index eb888c8479c3..336a749673d1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -136,7 +136,7 @@ { "calls: wrong src reg", .insns = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ 
-1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index fb13ca2d5606..d78627be060f 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -239,6 +239,7 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .runs = -1, }, /* invalid 8-byte reads from a 4-byte fields in bpf_sk_lookup */ { diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 5cf361d8eb1c..17fe33a75034 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other 
bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b117bdd3806d..1f82021429bf 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -75,6 +75,8 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 has pointer with unsupported alu operation", .result = ACCEPT, }, { @@ -91,5 +93,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 has pointer with unsupported alu operation", .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index b018ad71e0a8..bd436df5cc32 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -497,7 +497,7 @@ .result = ACCEPT, }, { - "unpriv: adding of 
fp", + "unpriv: adding of fp, reg", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_1, 0), @@ -510,6 +510,19 @@ .result = ACCEPT, }, { + "unpriv: adding of fp, imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ "unpriv: cmp of stack pointer", .insns = { BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index ed4e76b24649..e5913fd3b903 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -21,8 +21,6 @@ .fixup_map_hash_16b = { 5 }, .fixup_map_array_48b = { 8 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R1 tried to add from different maps", .retval = 1, }, { @@ -122,7 +120,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different pointers or scalars", + .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", .retval = 0, }, { @@ -169,7 +167,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps or paths", + .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", .retval = 0, }, { @@ -517,6 +515,27 @@ .retval = 0xabcdef12, }, { + "map access: value_ptr += N, value_ptr -= N known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV32_IMM(BPF_REG_1, 
0x12345678), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0x12345678, +}, +{ "map access: unknown scalar += value_ptr, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 26ae8d0b6ce3..8889b3f55236 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -17,19 +17,22 @@ KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vm KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config" INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX" NUM_COMPILE_JOBS="$(nproc)" +LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" +LOG_FILE="${LOG_FILE_BASE}.log" +EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { cat <<EOF -Usage: $0 [-i] [-d <output_dir>] -- [<command>] +Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>] <command> is the command you would normally run when you are in tools/testing/selftests/bpf. e.g: $0 -- ./test_progs -t test_lsm -If no command is specified, "${DEFAULT_COMMAND}" will be run by -default. +If no command is specified and a debug shell (-s) is not requested, +"${DEFAULT_COMMAND}" will be run by default. If you build your kernel using KBUILD_OUTPUT= or O= options, these can be passed as environment variables to the script: @@ -46,6 +49,9 @@ Options: -d) Update the output directory (default: ${OUTPUT_DIR}) -j) Number of jobs for compilation, similar to -j in make (default: ${NUM_COMPILE_JOBS}) + -s) Instead of powering off the VM, start an interactive + shell. 
If <command> is specified, the shell runs after + the command finishes executing EOF } @@ -146,7 +152,7 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" - local log_file="$2" + local exit_command="$2" mount_image @@ -160,17 +166,26 @@ EOF fi - sudo bash -c "cat >${init_script}" <<EOF -#!/bin/bash + sudo bash -c "echo '#!/bin/bash' > ${init_script}" + + if [[ "${command}" != "" ]]; then + sudo bash -c "cat >>${init_script}" <<EOF +# Have a default value in the exit status file +# incase the VM is forcefully stopped. +echo "130" > "/root/${EXIT_STATUS_FILE}" { cd /root/bpf echo ${command} stdbuf -oL -eL ${command} -} 2>&1 | tee /root/${log_file} -poweroff -f + echo "\$?" > "/root/${EXIT_STATUS_FILE}" +} 2>&1 | tee "/root/${LOG_FILE}" +# Ensure that the logs are written to disk +sync EOF + fi + sudo bash -c "echo ${exit_command} >> ${init_script}" sudo chmod a+x "${init_script}" unmount_image } @@ -221,10 +236,12 @@ EOF copy_logs() { local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" - local log_file="${mount_dir}/root/$1" + local log_file="${mount_dir}/root/${LOG_FILE}" + local exit_status_file="${mount_dir}/root/${EXIT_STATUS_FILE}" mount_image sudo cp ${log_file} "${OUTPUT_DIR}" + sudo cp ${exit_status_file} "${OUTPUT_DIR}" sudo rm -f ${log_file} unmount_image } @@ -263,14 +280,15 @@ main() { local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" local kernel_checkout=$(realpath "${script_dir}"/../../../../) - local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")" # By default the script searches for the kernel in the checkout directory but # it also obeys environment variables O= and KBUILD_OUTPUT= local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" local command="${DEFAULT_COMMAND}" local update_image="no" + local exit_command="poweroff -f" + local debug_shell="no" - while getopts 'hkid:j:' opt; do + while getopts 'hskid:j:' opt; 
do case ${opt} in i) update_image="yes" @@ -281,6 +299,11 @@ main() j) NUM_COMPILE_JOBS="$OPTARG" ;; + s) + command="" + debug_shell="yes" + exit_command="bash" + ;; h) usage exit 0 @@ -299,7 +322,7 @@ main() done shift $((OPTIND -1)) - if [[ $# -eq 0 ]]; then + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else command="$@" @@ -347,19 +370,25 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" "${log_file}" + update_init_script "${command}" "${exit_command}" run_vm "${kernel_bzimage}" - copy_logs "${log_file}" - echo "Logs saved in ${OUTPUT_DIR}/${log_file}" + if [[ "${command}" != "" ]]; then + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" + fi } catch() { local exit_code=$1 + local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" # This is just a cleanup and the directory may # have already been unmounted. So, don't let this # clobber the error code we intend to return. unmount_image || true + if [[ -f "${exit_status_file}" ]]; then + exit_code="$(cat ${exit_status_file})" + fi exit ${exit_code} } diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index f4a96d5ff524..1135fb980814 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -18,12 +18,7 @@ * These selftests test AF_XDP SKB and Native/DRV modes using veth * Virtual Ethernet interfaces. * - * The following tests are run: - * - * 1. AF_XDP SKB mode - * Generic mode XDP is driver independent, used when the driver does - * not have support for XDP. Works on any netdevice using sockets and - * generic XDP path. XDP hook from netif_receive_skb(). + * For each mode, the following tests are run: * a. nopoll - soft-irq processing * b. poll - using poll() syscall * c. 
Socket Teardown @@ -33,19 +28,25 @@ * Configure sockets as bi-directional tx/rx sockets, sets up fill and * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used + * e. Statistics + * Trigger some error conditions and ensure that the appropriate statistics + * are incremented. Within this test, the following statistics are tested: + * i. rx dropped + * Increase the UMEM frame headroom to a value which results in + * insufficient space in the rx buffer for both the packet and the headroom. + * ii. tx invalid + * Set the 'len' field of tx descriptors to an invalid value (umem frame + * size + 1). + * iii. rx ring full + * Reduce the size of the RX ring to a fraction of the fill ring size. + * iv. fill queue empty + * Do not populate the fill queue and then try to receive pkts. + * f. bpf_link resource persistence + * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0, + * then remove xsk sockets from queue 0 on both veth interfaces and + * finally run a traffic on queues ids 1 * - * 2. AF_XDP DRV/Native mode - * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes - * packets before SKB allocation. Provides better performance than SKB. Driver - * hook available just after DMA of buffer descriptor. - * a. nopoll - * b. poll - * c. Socket Teardown - * d. Bi-directional sockets - * - Only copy mode is supported because veth does not currently support - * zero-copy mode - * - * Total tests: 8 + * Total tests: 12 * * Flow: * ----- @@ -58,7 +59,7 @@ * - Rx thread verifies if all 10k packets were received and delivered in-order, * and have the right content * - * Enable/disable debug mode: + * Enable/disable packet dump mode: * -------------------------- * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add * parameter -D to params array in test_xsk.sh, i.e. 
params=("-S" "-D") @@ -96,35 +97,34 @@ typedef __u16 __sum16; #include "xdpxceiver.h" #include "../kselftest.h" +static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; +static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; +static const char *IP1 = "192.168.100.162"; +static const char *IP2 = "192.168.100.161"; +static const u16 UDP_PORT1 = 2020; +static const u16 UDP_PORT2 = 2121; + static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); + if (configured_mode == TEST_MODE_UNCONFIGURED) { + ksft_exit_fail_msg + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + } else { + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); + } } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ - "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ - opt_bidi ? "Bi-directional Sockets" : "")) - -static void pthread_init_mutex(void) -{ - pthread_mutex_init(&sync_mutex, NULL); - pthread_mutex_init(&sync_mutex_tx, NULL); - pthread_cond_init(&signal_rx_condition, NULL); - pthread_cond_init(&signal_tx_condition, NULL); -} - -static void pthread_destroy_mutex(void) -{ - pthread_mutex_destroy(&sync_mutex); - pthread_mutex_destroy(&sync_mutex_tx); - pthread_cond_destroy(&signal_rx_condition); - pthread_cond_destroy(&signal_tx_condition); -} + (ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\ + test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ + test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ + test_type == TEST_TYPE_STATS ? "Stats" : "",\ + test_type == TEST_TYPE_BPF_RES ? 
"BPF RES" : "")) static void *memset32_htonl(void *dest, u32 val, u32 size) { @@ -143,24 +143,11 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) } /* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* * Fold a partial checksum * This function code has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 csum_fold(__u32 csum) +static __u16 csum_fold(__u32 csum) { u32 sum = (__force u32)csum; @@ -173,7 +160,7 @@ static inline __u16 csum_fold(__u32 csum) * This function code has been taken from * Linux kernel lib/checksum.c */ -static inline u32 from64to32(u64 x) +static u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); @@ -182,13 +169,11 @@ static inline u32 from64to32(u64 x) return (u32)x; } -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); - /* * This function code has been taken from * Linux kernel lib/checksum.c */ -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { unsigned long long s = (__force u32)sum; @@ -206,13 +191,12 @@ __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u3 * This function has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } -static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +static u16 
udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) { u32 csum = 0; u32 cnt = 0; @@ -267,26 +251,37 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); } -static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) +static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) { + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; + int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; + struct xsk_umem_info *umem; int ret; - data->umem = calloc(1, sizeof(struct xsk_umem_info)); - if (!data->umem) + umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!umem) exit_with_error(errno); - ret = xsk_umem__create(&data->umem->umem, buffer, size, - &data->umem->fq, &data->umem->cq, NULL); + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); if (ret) exit_with_error(ret); - data->umem->buffer = buffer; + umem->buffer = buffer; + + data->umem_arr[idx] = umem; } static void xsk_populate_fill_ring(struct xsk_umem_info *umem) { int ret, i; - u32 idx; + u32 idx = 0; ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) @@ -296,51 +291,48 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); } -static int xsk_configure_socket(struct ifobject *ifobject) +static int xsk_configure_socket(struct ifobject *ifobject, int idx) { struct xsk_socket_config cfg; + struct xsk_socket_info *xsk; struct xsk_ring_cons *rxr; struct xsk_ring_prod *txr; int ret; - ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); - if (!ifobject->xsk) + xsk = calloc(1, sizeof(struct xsk_socket_info)); + 
if (!xsk) exit_with_error(errno); - ifobject->xsk->umem = ifobject->umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + xsk->umem = ifobject->umem; + cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; + cfg.xdp_flags = xdp_flags; + cfg.bind_flags = xdp_bind_flags; - if (!opt_bidi) { - rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; - txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; + if (test_type != TEST_TYPE_BIDI) { + rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL; + txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL; } else { - rxr = &ifobject->xsk->rx; - txr = &ifobject->xsk->tx; + rxr = &xsk->rx; + txr = &xsk->tx; } - ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, - opt_queue, ifobject->umem->umem, rxr, txr, &cfg); - + ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx, + ifobject->umem->umem, rxr, txr, &cfg); if (ret) return 1; + ifobject->xsk_arr[idx] = xsk; + return 0; } static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", optional_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"copy", no_argument, 0, 'c'}, - {"tear-down", no_argument, 0, 'T'}, - {"bidi", optional_argument, 0, 'B'}, - {"debug", optional_argument, 0, 'D'}, + {"dump-pkts", optional_argument, 0, 'D'}, + {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -352,23 +344,21 @@ static void usage(const char *prog) " Options:\n" " -i, --interface Use interface\n" " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP SKB mode\n" - " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" - " -c, --copy Force copy mode\n" - " -T, --tear-down Tear down sockets by repeatedly recreating them\n" - " -B, --bidi 
Bi-directional sockets test\n" - " -D, --debug Debug mode - dump packets L2 - L5\n" + " -D, --dump-pkts Dump packets L2 - L5\n" + " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); } -static bool switch_namespace(int idx) +static int switch_namespace(const char *nsname) { char fqns[26] = "/var/run/netns/"; int nsfd; - strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); + if (!nsname || strlen(nsname) == 0) + return -1; + + strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); nsfd = open(fqns, O_RDONLY); if (nsfd == -1) @@ -377,26 +367,9 @@ static bool switch_namespace(int idx) if (setns(nsfd, 0) == -1) exit_with_error(errno); - return true; -} + print_verbose("NS switched: %s\n", nsname); -static void *nsswitchthread(void *args) -{ - struct targs *targs = args; - - targs->retptr = false; - - if (switch_namespace(targs->idx)) { - ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname); - if (!ifdict[targs->idx]->ifindex) { - ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", - __func__, ifdict[targs->idx]->ifname); - } else { - ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname); - targs->retptr = true; - } - } - pthread_exit(NULL); + return nsfd; } static int validate_interfaces(void) @@ -408,33 +381,6 @@ static int validate_interfaces(void) ret = false; ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>."); } - if (strcmp(ifdict[i]->nsname, "")) { - struct targs *targs; - - targs = malloc(sizeof(*targs)); - if (!targs) - exit_with_error(errno); - - targs->idx = i; - if (pthread_create(&ns_thread, NULL, nsswitchthread, targs)) - exit_with_error(errno); - - pthread_join(ns_thread, NULL); - - if (targs->retptr) - ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); - - free(targs); - } else { - ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); - if (!ifdict[i]->ifindex) { - ksft_test_result_fail - ("ERROR: 
interface \"%s\" does not exist\n", ifdict[i]->ifname); - ret = false; - } else { - ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname); - } - } } return ret; } @@ -446,7 +392,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); if (c == -1) break; @@ -466,43 +412,26 @@ static void parse_command_line(int argc, char **argv) MAX_INTERFACES_NAMESPACE_CHARS); interface_index++; break; - case 'q': - opt_queue = atoi(optarg); - break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_SKB; - break; - case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_DRV; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'T': - opt_teardown = 1; - break; - case 'B': - opt_bidi = 1; - break; case 'D': debug_pkt_dump = 1; break; case 'C': opt_pkt_count = atoi(optarg); break; + case 'v': + opt_verbose = 1; + break; default: usage(basename(argv[0])); ksft_exit_xfail(); } } + if (!opt_pkt_count) { + print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT); + opt_pkt_count = DEFAULT_PKT_CNT; + } + if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); @@ -519,7 +448,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) { unsigned int rcvd; u32 idx; @@ -527,7 +456,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) if (!xsk->outstanding_tx) return; - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); 
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); @@ -597,8 +526,10 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { - u32 idx; + u32 idx = 0; unsigned int i; + bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; + u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) complete_tx_only(xsk, batch_size); @@ -607,17 +538,21 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = PKT_SIZE; + tx_desc->len = len; } xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->outstanding_tx += batch_size; + if (!tx_invalid_test) { + xsk->outstanding_tx += batch_size; + } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + kick_tx(xsk); + } *frameptr += batch_size; *frameptr %= num_frames; complete_tx_only(xsk, batch_size); } -static inline int get_batch_size(int pkt_cnt) +static int get_batch_size(int pkt_cnt) { if (!opt_pkt_count) return BATCH_SIZE; @@ -654,7 +589,7 @@ static void tx_only_all(struct ifobject *ifobject) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -673,48 +608,43 @@ static void tx_only_all(struct ifobject *ifobject) static void worker_pkt_dump(void) { - struct in_addr ipaddr; + struct ethhdr *ethhdr; + struct iphdr *iphdr; + struct udphdr *udphdr; + char s[128]; + int payload; + void *ptr; fprintf(stdout, "---------------------------------------\n"); for (int iter = 0; iter < num_frames - 1; iter++) { + ptr = pkt_buf[iter]->payload; + ethhdr = ptr; + iphdr = ptr + sizeof(*ethhdr); + udphdr = ptr 
+ sizeof(*ethhdr) + sizeof(*iphdr); + /*extract L2 frame */ fprintf(stdout, "DEBUG>> L2: dst mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_dest[i]); + fprintf(stdout, "%02X", ethhdr->h_dest[i]); fprintf(stdout, "\nDEBUG>> L2: src mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_source[i]); + fprintf(stdout, "%02X", ethhdr->h_source[i]); /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); - + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->source)); - - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->dest)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); /*extract L5 frame */ - int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); + payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); if (payload == EOT) { - ksft_print_msg("End-of-transmission frame received\n"); + 
print_verbose("End-of-transmission frame received\n"); fprintf(stdout, "---------------------------------------\n"); break; } @@ -723,6 +653,48 @@ static void worker_pkt_dump(void) } } +static void worker_stats_validate(struct ifobject *ifobject) +{ + struct xdp_statistics stats; + socklen_t optlen; + int err; + struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? + ifdict[!ifobject->ifdict_index]->xsk->xsk : + ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; + + sigvar = 0; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) + return; + + if (optlen == sizeof(struct xdp_statistics)) { + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + xsk_stat = stats.rx_dropped; + break; + case STAT_TEST_TX_INVALID: + xsk_stat = stats.tx_invalid_descs; + break; + case STAT_TEST_RX_FULL: + xsk_stat = stats.rx_ring_full; + expected_stat -= RX_FULL_RXQSIZE; + break; + case STAT_TEST_RX_FILL_EMPTY: + xsk_stat = stats.rx_fill_ring_empty_descs; + break; + default: + break; + } + + if (xsk_stat == expected_stat) + sigvar = 1; + } +} + static void worker_pkt_validate(void) { u32 payloadseqnum = -2; @@ -746,7 +718,7 @@ static void worker_pkt_validate(void) } if (payloadseqnum == EOT) { - ksft_print_msg("End-of-transmission frame received: PASS\n"); + print_verbose("End-of-transmission frame received: PASS\n"); sigvar = 1; break; } @@ -773,37 +745,69 @@ static void worker_pkt_validate(void) } } -static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr, - atomic_int *spinningptr) +static void thread_common_ops(struct ifobject *ifobject, void *bufs) { + int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; int ctr = 0; int ret; - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + ifobject->ns_fd = switch_namespace(ifobject->nsname); + + if 
(test_type == TEST_TYPE_BPF_RES) + umem_sz *= 2; + + bufs = mmap(NULL, umem_sz, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); /* Retry Create Socket if it fails as xsk_socket__create() * is asynchronous - * - * Essential to lock Mutex here to prevent Tx thread from - * entering before Rx and causing a deadlock */ - pthread_mutex_lock(mutexptr); while (ret && ctr < SOCK_RECONF_CTR) { - atomic_store(spinningptr, 1); - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); usleep(USLEEP_MAX); ctr++; } - atomic_store(spinningptr, 0); - pthread_mutex_unlock(mutexptr); if (ctr >= SOCK_RECONF_CTR) exit_with_error(ret); + + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; + + if (test_type == TEST_TYPE_BPF_RES) { + xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); + ifobject->umem = ifobject->umem_arr[1]; + ret = xsk_configure_socket(ifobject, 1); + } + + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; + print_verbose("Interface [%s] vector [%s]\n", + ifobject->ifname, ifobject->fv.vector == tx ? 
"Tx" : "Rx"); +} + +static bool testapp_is_test_two_stepped(void) +{ + return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step; +} + +static void testapp_cleanup_xsk_res(struct ifobject *ifobj) +{ + if (testapp_is_test_two_stepped()) { + xsk_socket__delete(ifobj->xsk->xsk); + (void)xsk_umem__delete(ifobj->umem->umem); + } } -static void *worker_testapp_validate(void *arg) +static void *worker_testapp_validate_tx(void *arg) { struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); @@ -813,149 +817,97 @@ static void *worker_testapp_validate(void *arg) struct generic_data data; void *bufs = NULL; - pthread_attr_setstacksize(&attr, THREAD_STACK); - - if (!bidi_pass) { - bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); - - if (strcmp(ifobject->nsname, "")) - switch_namespace(ifobject->ifdict_index); + if (!second_step) + thread_common_ops(ifobject, bufs); + + for (int i = 0; i < num_frames; i++) { + /*send EOT frame */ + if (i == (num_frames - 1)) + data.seqnum = -1; + else + data.seqnum = i; + gen_udp_hdr(&data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); + gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - if (ifobject->fv.vector == tx) { - int spinningrxctr = 0; - - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); - - while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { - spinningrxctr++; - usleep(USLEEP_MAX); - } + print_verbose("Sending %d packets on interface %s\n", + (opt_pkt_count - 1), ifobject->ifname); + tx_only_all(ifobject); - ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname); - for (int i = 0; i < num_frames; i++) { - /*send EOT frame */ - if (i == (num_frames - 1)) - data.seqnum = -1; - else - 
data.seqnum = i; - gen_udp_hdr(&data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } + testapp_cleanup_xsk_res(ifobject); + pthread_exit(NULL); +} - ksft_print_msg("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ifobject->ifname); - tx_only_all(ifobject); - } else if (ifobject->fv.vector == rx) { - struct pollfd fds[MAX_SOCKS] = { }; - int ret; +static void *worker_testapp_validate_rx(void *arg) +{ + struct ifobject *ifobject = (struct ifobject *)arg; + struct pollfd fds[MAX_SOCKS] = { }; + void *bufs = NULL; - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); + if (!second_step) + thread_common_ops(ifobject, bufs); - ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname); + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) xsk_populate_fill_ring(ifobject->umem); - TAILQ_INIT(&head); - if (debug_pkt_dump) { - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); - if (!pkt_buf) - exit_with_error(errno); - } + TAILQ_INIT(&head); + if (debug_pkt_dump) { + pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); + if (!pkt_buf) + exit_with_error(errno); + } - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); - fds[0].events = POLLIN; + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); + fds[0].events = POLLIN; - pthread_mutex_lock(&sync_mutex); - pthread_cond_signal(&signal_rx_condition); - pthread_mutex_unlock(&sync_mutex); + pthread_barrier_wait(&barr); - while (1) { - if (opt_poll) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret <= 0) - continue; - } + while (1) { + if (test_type != TEST_TYPE_STATS) { rx_pkt(ifobject->xsk, fds); worker_pkt_validate(); - - if (sigvar) - break; + } else { + worker_stats_validate(ifobject); } + if (sigvar) + break; + } - ksft_print_msg("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + print_verbose("Received %d packets on interface 
%s\n", + pkt_counter, ifobject->ifname); - if (opt_teardown) - ksft_print_msg("Destroying socket\n"); - } + if (test_type == TEST_TYPE_TEARDOWN) + print_verbose("Destroying socket\n"); - if (!opt_bidi || bidi_pass) { - xsk_socket__delete(ifobject->xsk->xsk); - (void)xsk_umem__delete(ifobject->umem->umem); - } + testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } static void testapp_validate(void) { - struct timespec max_wait = { 0, 0 }; + bool bidi = test_type == TEST_TYPE_BIDI; + bool bpf = test_type == TEST_TYPE_BPF_RES; - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, THREAD_STACK); - - if (opt_bidi && bidi_pass) { - pthread_init_mutex(); - if (!switching_notify) { - ksft_print_msg("Switching Tx/Rx vectors\n"); - switching_notify++; - } - } - - pthread_mutex_lock(&sync_mutex); + if (pthread_barrier_init(&barr, NULL, 2)) + exit_with_error(errno); /*Spawn RX thread */ - if (!opt_bidi || !bidi_pass) { - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) - exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[0]->fv.vector = rx; - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) - exit_with_error(errno); - } + pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); - if (clock_gettime(CLOCK_REALTIME, &max_wait)) + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) exit_with_error(errno); - max_wait.tv_sec += TMOUT_SEC; - - if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) - exit_with_error(errno); - - pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!opt_bidi || !bidi_pass) { - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) - exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[1]->fv.vector = tx; - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) - exit_with_error(errno); - } + pthread_create(&t1, NULL, 
ifdict_tx->func_ptr, ifdict_tx); pthread_join(t1, NULL); pthread_join(t0, NULL); - if (debug_pkt_dump) { + if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { worker_pkt_dump(); for (int iter = 0; iter < num_frames - 1; iter++) { free(pkt_buf[iter]->payload); @@ -964,73 +916,217 @@ static void testapp_validate(void) free(pkt_buf); } - if (!opt_teardown && !opt_bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } -static void testapp_sockets(void) +static void testapp_teardown(void) +{ + int i; + + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + } + + print_ksft_result(); +} + +static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) { - for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) { + void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr; + enum fvector tmp_vector = ifobj1->fv.vector; + + ifobj1->func_ptr = ifobj2->func_ptr; + ifobj1->fv.vector = ifobj2->fv.vector; + + ifobj2->func_ptr = tmp_func_ptr; + ifobj2->fv.vector = tmp_vector; + + ifdict_tx = ifobj1; + ifdict_rx = ifobj2; +} + +static void testapp_bidi(void) +{ + for (int i = 0; i < MAX_BIDI_ITER; i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; - ksft_print_msg("Creating socket\n"); + print_verbose("Creating socket\n"); testapp_validate(); - opt_bidi ? 
bidi_pass++ : bidi_pass; + if (!second_step) { + print_verbose("Switching Tx/Rx vectors\n"); + swap_vectors(ifdict[1], ifdict[0]); + } + second_step = true; } + swap_vectors(ifdict[0], ifdict[1]); + print_ksft_result(); } -static void init_iface_config(struct ifaceconfigobj *ifaceconfig) +static void swap_xsk_res(void) { - /*Init interface0 */ - ifdict[0]->fv.vector = tx; - memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN); - memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN); - ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr; - ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr; - ifdict[0]->dst_port = ifaceconfig->dst_port; - ifdict[0]->src_port = ifaceconfig->src_port; - - /*Init interface1 */ - ifdict[1]->fv.vector = rx; - memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN); - memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN); - ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr; - ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr; - ifdict[1]->dst_port = ifaceconfig->src_port; - ifdict[1]->src_port = ifaceconfig->dst_port; + xsk_socket__delete(ifdict_tx->xsk->xsk); + xsk_umem__delete(ifdict_tx->umem->umem); + xsk_socket__delete(ifdict_rx->xsk->xsk); + xsk_umem__delete(ifdict_rx->umem->umem); + ifdict_tx->umem = ifdict_tx->umem_arr[1]; + ifdict_tx->xsk = ifdict_tx->xsk_arr[1]; + ifdict_rx->umem = ifdict_rx->umem_arr[1]; + ifdict_rx->xsk = ifdict_rx->xsk_arr[1]; +} + +static void testapp_bpf_res(void) +{ + int i; + + for (i = 0; i < MAX_BPF_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + if (!second_step) + swap_xsk_res(); + second_step = true; + } + + print_ksft_result(); +} + +static void testapp_stats(void) +{ + for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) { + stat_test_type = i; + + /* reset defaults */ + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + 
frame_headroom = XSK_UMEM__DEFAULT_FRAME_SIZE - + XDP_PACKET_HEADROOM - 1; + break; + case STAT_TEST_RX_FULL: + rxqsize = RX_FULL_RXQSIZE; + break; + default: + break; + } + testapp_validate(); + } + + print_ksft_result(); +} + +static void init_iface(struct ifobject *ifobj, const char *dst_mac, + const char *src_mac, const char *dst_ip, + const char *src_ip, const u16 dst_port, + const u16 src_port, enum fvector vector) +{ + struct in_addr ip; + + memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); + memcpy(ifobj->src_mac, src_mac, ETH_ALEN); + + inet_aton(dst_ip, &ip); + ifobj->dst_ip = ip.s_addr; + + inet_aton(src_ip, &ip); + ifobj->src_ip = ip.s_addr; + + ifobj->dst_port = dst_port; + ifobj->src_port = src_port; + + if (vector == tx) { + ifobj->fv.vector = tx; + ifobj->func_ptr = worker_testapp_validate_tx; + ifdict_tx = ifobj; + } else { + ifobj->fv.vector = rx; + ifobj->func_ptr = worker_testapp_validate_rx; + ifdict_rx = ifobj; + } +} + +static void run_pkt_test(int mode, int type) +{ + test_type = type; + + /* reset defaults after potential previous test */ + xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + pkt_counter = 0; + second_step = 0; + prev_pkt = -1; + sigvar = 0; + stat_test_type = -1; + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + configured_mode = mode; + + switch (mode) { + case (TEST_MODE_SKB): + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case (TEST_MODE_DRV): + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + break; + } + + switch (test_type) { + case TEST_TYPE_STATS: + testapp_stats(); + break; + case TEST_TYPE_TEARDOWN: + testapp_teardown(); + break; + case TEST_TYPE_BIDI: + testapp_bidi(); + break; + case TEST_TYPE_BPF_RES: + testapp_bpf_res(); + break; + default: + testapp_validate(); + break; + } } int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; + bool failure = false; + int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) exit_with_error(errno); - 
const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; - const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; - const char *IP1 = "192.168.100.162"; - const char *IP2 = "192.168.100.161"; - u16 UDP_DST_PORT = 2020; - u16 UDP_SRC_PORT = 2121; - - ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); - memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); - memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); - inet_aton(IP1, &ifaceconfig->dst_ip); - inet_aton(IP2, &ifaceconfig->src_ip); - ifaceconfig->dst_port = UDP_DST_PORT; - ifaceconfig->src_port = UDP_SRC_PORT; - for (int i = 0; i < MAX_INTERFACES; i++) { ifdict[i] = malloc(sizeof(struct ifobject)); if (!ifdict[i]) exit_with_error(errno); ifdict[i]->ifdict_index = i; + ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); + if (!ifdict[i]->xsk_arr) { + failure = true; + goto cleanup; + } + ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); + if (!ifdict[i]->umem_arr) { + failure = true; + goto cleanup; + } } setlocale(LC_ALL, ""); @@ -1039,25 +1135,27 @@ int main(int argc, char **argv) num_frames = ++opt_pkt_count; - init_iface_config(ifaceconfig); - - pthread_init_mutex(); + init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); + init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); - ksft_set_plan(1); + ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - if (!opt_teardown && !opt_bidi) { - testapp_validate(); - } else if (opt_teardown && opt_bidi) { - ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n"); - ksft_exit_xfail(); - } else { - testapp_sockets(); + for (i = 0; i < TEST_MODE_MAX; i++) { + for (j = 0; j < TEST_TYPE_MAX; j++) + run_pkt_test(i, j); } - for (int i = 0; i < MAX_INTERFACES; i++) +cleanup: + for (int i = 0; i < MAX_INTERFACES; i++) { + if (ifdict[i]->ns_fd != -1) + close(ifdict[i]->ns_fd); + free(ifdict[i]->xsk_arr); + free(ifdict[i]->umem_arr); free(ifdict[i]); + } - pthread_destroy_mutex(); + if (failure) + exit_with_error(errno); 
ksft_exit_pass(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 0e9f9b7e61c2..6c428b276ab6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -23,6 +23,7 @@ #define MAX_SOCKS 1 #define MAX_TEARDOWN_ITER 10 #define MAX_BIDI_ITER 2 +#define MAX_BPF_ITER 2 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define MIN_PKT_SIZE 64 @@ -33,41 +34,63 @@ #define IP_PKT_TOS 0x9 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define TMOUT_SEC (3) #define EOT (-1) #define USLEEP_MAX 200000 -#define THREAD_STACK 60000000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -#define NEED_WAKEUP true +#define DEFAULT_PKT_CNT 10000 +#define RX_FULL_RXQSIZE 32 + +#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; -enum TESTS { - ORDER_CONTENT_VALIDATE_XDP_SKB = 0, - ORDER_CONTENT_VALIDATE_XDP_DRV = 1, +enum TEST_MODES { + TEST_MODE_UNCONFIGURED = -1, + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_MAX +}; + +enum TEST_TYPES { + TEST_TYPE_NOPOLL, + TEST_TYPE_POLL, + TEST_TYPE_TEARDOWN, + TEST_TYPE_BIDI, + TEST_TYPE_STATS, + TEST_TYPE_BPF_RES, + TEST_TYPE_MAX +}; + +enum STAT_TEST_TYPES { + STAT_TEST_RX_DROPPED, + STAT_TEST_TX_INVALID, + STAT_TEST_RX_FULL, + STAT_TEST_RX_FILL_EMPTY, + STAT_TEST_TYPE_MAX }; -u8 uut; -u8 debug_pkt_dump; -u32 num_frames; -u8 switching_notify; -u8 bidi_pass; +static int configured_mode = TEST_MODE_UNCONFIGURED; +static u8 debug_pkt_dump; +static u32 num_frames; +static bool second_step; +static int test_type; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int opt_queue; static int opt_pkt_count; -static int opt_poll; -static int opt_teardown; -static int opt_bidi; -static u32 opt_xdp_bind_flags = 
XDP_USE_NEED_WAKEUP; +static u8 opt_verbose; + +static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; -static u32 prev_pkt = -1; +static long prev_pkt = -1; static int sigvar; +static int stat_test_type; +static u32 rxqsize; +static u32 frame_headroom; struct xsk_umem_info { struct xsk_ring_prod fq; @@ -99,47 +122,32 @@ struct generic_data { u32 seqnum; }; -struct ifaceconfigobj { - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; - struct in_addr dst_ip; - struct in_addr src_ip; - u16 src_port; - u16 dst_port; -} *ifaceconfig; - struct ifobject { - int ifindex; - int ifdict_index; char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; - struct flow_vector fv; struct xsk_socket_info *xsk; + struct xsk_socket_info **xsk_arr; + struct xsk_umem_info **umem_arr; struct xsk_umem_info *umem; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; + void *(*func_ptr)(void *arg); + struct flow_vector fv; + int ns_fd; + int ifdict_index; u32 dst_ip; u32 src_ip; u16 src_port; u16 dst_port; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; }; static struct ifobject *ifdict[MAX_INTERFACES]; +static struct ifobject *ifdict_rx; +static struct ifobject *ifdict_tx; /*threads*/ -atomic_int spinning_tx; -atomic_int spinning_rx; -pthread_mutex_t sync_mutex; -pthread_mutex_t sync_mutex_tx; -pthread_cond_t signal_rx_condition; -pthread_cond_t signal_tx_condition; -pthread_t t0, t1, ns_thread; -pthread_attr_t attr; - -struct targs { - bool retptr; - int idx; -}; +pthread_barrier_t barr; +pthread_t t0, t1; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); struct head_s *head_p; diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 9d54c4645127..dac1c5f78752 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -82,24 +82,21 
@@ clear_configs() { if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; } - echo "removing ns $3" + { ip netns exec $3 ip link del $2; } ip netns del $3 fi #Once we delete a veth pair node, the entire veth pair is removed, #this is just to be cautious just incase the NS does not exist then #veth node inside NS won't get removed so we explicitly remove it [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1"; ip link del $1; } + { ip link del $1; } if [ -f ${SPECFILE} ]; then - echo "removing spec file:" ${SPECFILE} rm -f ${SPECFILE} fi } cleanup_exit() { - echo "cleaning up..." clear_configs $1 $2 $3 } @@ -108,28 +105,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; } } -vethXDPgeneric() -{ - ip link set dev $1 xdpdrv off - ip netns exec $3 ip link set dev $2 xdpdrv off -} - -vethXDPnative() -{ - ip link set dev $1 xdpgeneric off - ip netns exec $3 ip link set dev $2 xdpgeneric off -} - execxdpxceiver() { - local -a 'paramkeys=("${!'"$1"'[@]}")' copy - paramkeysstr=${paramkeys[*]} - - for index in $paramkeysstr; - do - current=$1"[$index]" - copy[$index]=${!current} - done - - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 1fedfc9da434..42d44e27802c 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -446,6 +446,35 @@ __invalid_nexthop_test() log_test "Unresolved neigh: nexthop does not exist: $desc" } +__invalid_nexthop_bucket_test() +{ + 
local desc=$1; shift + local dip=$1; shift + local via_add=$1; shift + local trap_name="unresolved_neigh" + + RET=0 + + # Check that route to nexthop that does not exist triggers + # unresolved_neigh + ip nexthop add id 1 via $via_add dev $rp2 + ip nexthop add id 10 group 1 type resilient buckets 32 + ip route add $dip nhid 10 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + ping_do $h1 $dip + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t0_packets -eq $t1_packets ]]; then + check_err 1 "Trap counter did not increase" + fi + + ip route del $dip nhid 10 + ip nexthop del id 10 + ip nexthop del id 1 + log_test "Unresolved neigh: nexthop bucket does not exist: $desc" +} + unresolved_neigh_test() { __host_miss_test "IPv4" 198.51.100.1 @@ -453,6 +482,8 @@ unresolved_neigh_test() __invalid_nexthop_test "IPv4" 198.51.100.1 198.51.100.3 24 198.51.100.4 __invalid_nexthop_test "IPv6" 2001:db8:2::1 2001:db8:2::3 64 \ 2001:db8:2::4 + __invalid_nexthop_bucket_test "IPv4" 198.51.100.1 198.51.100.4 + __invalid_nexthop_bucket_test "IPv6" 2001:db8:2::1 2001:db8:2::4 } vrf_without_routes_create() diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh index 6f3a70df63bc..e00435753008 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh @@ -120,12 +120,13 @@ __mirror_gre_test() sleep 5 for ((i = 0; i < count; ++i)); do + local sip=$(mirror_gre_ipv6_addr 1 $i)::1 local dip=$(mirror_gre_ipv6_addr 1 $i)::2 local htun=h3-gt6-$i local message icmp6_capture_install $htun - mirror_test v$h1 "" $dip $htun 100 10 + mirror_test v$h1 $sip $dip $htun 100 10 icmp6_capture_uninstall $htun done } diff --git a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh index f813ffefc07e..65f43a7ce9c9 100644 --- 
a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh @@ -55,10 +55,6 @@ port_test() | jq '.[][][] | select(.name=="physical_ports") |.["occ"]') [[ $occ -eq $max_ports ]] - if [[ $should_fail -eq 0 ]]; then - check_err $? "Mismatch ports number: Expected $max_ports, got $occ." - else - check_err_fail $should_fail $? "Reached more ports than expected" - fi + check_err_fail $should_fail $? "Attempt to create $max_ports ports (actual result $occ)" } diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index ed346da5d3cb..a217f9f6775b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -33,6 +33,7 @@ ALL_TESTS=" nexthop_obj_invalid_test nexthop_obj_offload_test nexthop_obj_group_offload_test + nexthop_obj_bucket_offload_test nexthop_obj_blackhole_offload_test nexthop_obj_route_offload_test devlink_reload_test @@ -739,11 +740,28 @@ nexthop_obj_invalid_test() ip nexthop add id 1 dev $swp1 ip nexthop add id 2 dev $swp1 + ip nexthop add id 3 via 192.0.2.3 dev $swp1 ip nexthop add id 10 group 1/2 check_fail $? "managed to configure a nexthop group with device-only nexthops when should not" + ip nexthop add id 10 group 3 type resilient buckets 7 + check_fail $? "managed to configure a too small resilient nexthop group when should not" + + ip nexthop add id 10 group 3 type resilient buckets 129 + check_fail $? "managed to configure a resilient nexthop group with invalid number of buckets when should not" + + ip nexthop add id 10 group 1/2 type resilient buckets 32 + check_fail $? "managed to configure a resilient nexthop group with device-only nexthops when should not" + + ip nexthop add id 10 group 3 type resilient buckets 32 + check_err $? "failed to configure a valid resilient nexthop group" + ip nexthop replace id 3 dev $swp1 + check_fail $? 
"managed to populate a nexthop bucket with a device-only nexthop when should not" + log_test "nexthop objects - invalid configurations" + ip nexthop del id 10 + ip nexthop del id 3 ip nexthop del id 2 ip nexthop del id 1 @@ -858,6 +876,70 @@ nexthop_obj_group_offload_test() simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 } +nexthop_obj_bucket_offload_test() +{ + # Test offload indication of nexthop buckets + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip nexthop add id 2 via 2001:db8:1::2 dev $swp1 + ip nexthop add id 10 group 1/2 type resilient buckets 32 idle_timer 0 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? "IPv4 nexthop buckets not marked as offloaded when should" + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked as offloaded when should" + + # Invalidate nexthop id 1 + ip neigh replace 192.0.2.2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 1 + check_err $? "IPv4 nexthop buckets not marked with trap when should" + + # Invalidate nexthop id 2 + ip neigh replace 2001:db8:1::2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked with trap when should" + + # Revalidate nexthop id 1 by changing its configuration + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? 
"nexthop bucket not marked as offloaded after revalidating nexthop" + + # Revalidate nexthop id 2 by changing its neighbour + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "nexthop bucket not marked as offloaded after revalidating neighbour" + + log_test "nexthop bucket offload indication" + + ip neigh del 2001:db8:1::2 dev $swp1 + ip neigh del 192.0.2.3 dev $swp1 + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 10 + ip nexthop del id 2 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + nexthop_obj_blackhole_offload_test() { # Test offload indication of blackhole nexthop objects diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh index b0cb1aaffdda..33ddd01689be 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh @@ -507,8 +507,8 @@ do_red_test() check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." local diff=$((limit - backlog)) pct=$((100 * diff / limit)) - ((0 <= pct && pct <= 5)) - check_err $? "backlog $backlog / $limit expected <= 5% distance" + ((0 <= pct && pct <= 10)) + check_err $? "backlog $backlog / $limit expected <= 10% distance" log_test "TC $((vlan - 10)): RED backlog > limit" stop_traffic diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh index 3f007c5f8361..f3ef3274f9b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh @@ -67,6 +67,13 @@ red_test() { install_qdisc + # Make sure that we get the non-zero value if there is any. 
+ local cur=$(busywait 1100 until_counter_is "> 0" \ + qdisc_stats_get $swp3 10: .backlog) + (( cur == 0 )) + check_err $? "backlog of $cur observed on non-busy qdisc" + log_test "$QDISC backlog properly cleaned" + do_red_test 10 $BACKLOG1 do_red_test 11 $BACKLOG2 diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh deleted file mode 100755 index 0231205a7147..000000000000 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -lib_dir=$(dirname $0)/../../../../net/forwarding - -VXPORT=4789 - -ALL_TESTS=" - create_dot1d_and_dot1ad_vxlans -" -NUM_NETIFS=2 -source $lib_dir/lib.sh - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - - ip link set dev $swp1 up - ip link set dev $swp2 up -} - -cleanup() -{ - pre_cleanup - - ip link set dev $swp2 down - ip link set dev $swp1 down -} - -create_dot1d_and_dot1ad_vxlans() -{ - RET=0 - - ip link add dev br0 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ - vlan_default_pvid 0 mcast_snooping 0 - ip link set dev br0 up - - ip link add name vx100 type vxlan id 1000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx100 up - - ip link set dev $swp1 master br0 - ip link set dev vx100 master br0 - bridge vlan add vid 100 dev vx100 pvid untagged - - ip link add dev br1 type bridge vlan_filtering 0 mcast_snooping 0 - ip link set dev br1 up - - ip link add name vx200 type vxlan id 2000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx200 up - - ip link set dev $swp2 master br1 - ip link set dev vx200 master br1 2>/dev/null - check_fail $? "802.1d and 802.1ad VxLANs at the same time not rejected" - - ip link set dev vx200 master br1 2>&1 >/dev/null \ - | grep -q mlxsw_spectrum - check_err $? 
"802.1d and 802.1ad VxLANs at the same time rejected without extack" - - log_test "create 802.1d and 802.1ad VxLANs" - - ip link del dev vx200 - ip link del dev br1 - ip link del dev vx100 - ip link del dev br0 -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 4a1c9328555f..50654f8a8c37 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -30,6 +30,7 @@ trap cleanup EXIT ALL_TESTS="router tc_flower mirror_gre tc_police port" for current_test in ${TESTS:-$ALL_TESTS}; do + RET_FIN=0 source ${current_test}_scale.sh num_netifs_var=${current_test^^}_NUM_NETIFS @@ -48,8 +49,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' overflow $target" fi + RET_FIN=$(( RET_FIN || RET )) done done current_test="" -exit "$RET" +exit "$RET_FIN" diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index 087a884f66cd..685dfb3478b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -24,6 +24,7 @@ trap cleanup EXIT ALL_TESTS="router tc_flower mirror_gre tc_police port" for current_test in ${TESTS:-$ALL_TESTS}; do + RET_FIN=0 source ${current_test}_scale.sh num_netifs_var=${current_test^^}_NUM_NETIFS @@ -50,8 +51,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do log_test "'$current_test' [$profile] overflow $target" fi done + RET_FIN=$(( RET_FIN || RET )) done done current_test="" -exit "$RET" +exit "$RET_FIN" diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh index 
cc0f07e72cf2..aa74be9f47c8 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh @@ -98,11 +98,7 @@ __tc_flower_test() jq -r '[ .[] | select(.kind == "flower") | .options | .in_hw ]' | jq .[] | wc -l) [[ $((offload_count - 1)) -eq $count ]] - if [[ $should_fail -eq 0 ]]; then - check_err $? "Offload mismatch" - else - check_err_fail $should_fail $? "Offload more than expacted" - fi + check_err_fail $should_fail $? "Attempt to offload $count rules (actual result $((offload_count - 1)))" } tc_flower_test() diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 553cb9fad508..5ec3beb637c8 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -11,6 +11,7 @@ ALL_TESTS=" matchall_mirror_behind_flower_ingress_test matchall_sample_behind_flower_ingress_test matchall_mirror_behind_flower_egress_test + matchall_proto_match_test police_limits_test multi_police_test " @@ -18,6 +19,7 @@ NUM_NETIFS=2 source $lib_dir/tc_common.sh source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh switch_create() { @@ -166,7 +168,8 @@ matchall_sample_egress_test() RET=0 # It is forbidden in mlxsw driver to have matchall with sample action - # bound on egress + # bound on egress. Spectrum-1 specific restriction + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return tc qdisc add dev $swp1 clsact @@ -289,6 +292,22 @@ matchall_mirror_behind_flower_egress_test() matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" } +matchall_proto_match_test() +{ + RET=0 + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ + matchall skip_sw \ + action sample group 1 rate 100 + check_fail $? 
"Incorrect success to add matchall rule with protocol match" + + tc qdisc del dev $swp1 clsact + + log_test "matchall protocol match" +} + police_limits_test() { RET=0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh new file mode 100755 index 000000000000..093bed088ad0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -0,0 +1,657 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test that packets are sampled when tc-sample is used and that reported +# metadata is correct. Two sets of hosts (with and without LAG) are used, since +# metadata extraction in mlxsw is a bit different when LAG is involved. +# +# +---------------------------------+ +---------------------------------+ +# | H1 (vrf) | | H3 (vrf) | +# | + $h1 | | + $h3_lag | +# | | 192.0.2.1/28 | | | 192.0.2.17/28 | +# | | | | | | +# | | default via 192.0.2.2 | | | default via 192.0.2.18 | +# +----|----------------------------+ +----|----------------------------+ +# | | +# +----|-----------------------------------------|----------------------------+ +# | | 192.0.2.2/28 | 192.0.2.18/28 | +# | + $rp1 + $rp3_lag | +# | | +# | + $rp2 + $rp4_lag | +# | | 198.51.100.2/28 | 198.51.100.18/28 | +# +----|-----------------------------------------|----------------------------+ +# | | +# +----|----------------------------+ +----|----------------------------+ +# | | default via 198.51.100.2 | | | default via 198.51.100.18 | +# | | | | | | +# | | 198.51.100.1/28 | | | 198.51.100.17/28 | +# | + $h2 | | + $h4_lag | +# | H2 (vrf) | | H4 (vrf) | +# +---------------------------------+ +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + tc_sample_rate_test + tc_sample_max_rate_test + tc_sample_conflict_test + tc_sample_group_conflict_test + tc_sample_md_iif_test + tc_sample_md_lag_iif_test + tc_sample_md_oif_test + tc_sample_md_lag_oif_test + tc_sample_md_out_tc_test + 
tc_sample_md_out_tc_occ_test + tc_sample_md_latency_test + tc_sample_acl_group_conflict_test + tc_sample_acl_rate_test + tc_sample_acl_max_rate_test +" +NUM_NETIFS=8 +CAPTURE_FILE=$(mktemp) +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/28 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/28 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 +} + +h2_destroy() +{ + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/28 +} + +h3_create() +{ + ip link set dev $h3 down + ip link add name ${h3}_bond type bond mode 802.3ad + ip link set dev $h3 master ${h3}_bond + + simple_if_init ${h3}_bond 192.0.2.17/28 + + ip -4 route add default vrf v${h3}_bond nexthop via 192.0.2.18 +} + +h3_destroy() +{ + ip -4 route del default vrf v${h3}_bond nexthop via 192.0.2.18 + + simple_if_fini ${h3}_bond 192.0.2.17/28 + + ip link set dev $h3 nomaster + ip link del dev ${h3}_bond +} + +h4_create() +{ + ip link set dev $h4 down + ip link add name ${h4}_bond type bond mode 802.3ad + ip link set dev $h4 master ${h4}_bond + + simple_if_init ${h4}_bond 198.51.100.17/28 + + ip -4 route add default vrf v${h4}_bond nexthop via 198.51.100.18 +} + +h4_destroy() +{ + ip -4 route del default vrf v${h4}_bond nexthop via 198.51.100.18 + + simple_if_fini ${h4}_bond 198.51.100.17/28 + + ip link set dev $h4 nomaster + ip link del dev ${h4}_bond +} + +router_create() +{ + ip link set dev $rp1 up + __addr_add_del $rp1 add 192.0.2.2/28 + tc qdisc add dev $rp1 clsact + + ip link set dev $rp2 up + __addr_add_del $rp2 add 198.51.100.2/28 + tc qdisc add dev $rp2 clsact + + ip link add name ${rp3}_bond type bond mode 802.3ad + ip link set dev $rp3 
master ${rp3}_bond + __addr_add_del ${rp3}_bond add 192.0.2.18/28 + tc qdisc add dev $rp3 clsact + ip link set dev ${rp3}_bond up + + ip link add name ${rp4}_bond type bond mode 802.3ad + ip link set dev $rp4 master ${rp4}_bond + __addr_add_del ${rp4}_bond add 198.51.100.18/28 + tc qdisc add dev $rp4 clsact + ip link set dev ${rp4}_bond up +} + +router_destroy() +{ + ip link set dev ${rp4}_bond down + tc qdisc del dev $rp4 clsact + __addr_add_del ${rp4}_bond del 198.51.100.18/28 + ip link set dev $rp4 nomaster + ip link del dev ${rp4}_bond + + ip link set dev ${rp3}_bond down + tc qdisc del dev $rp3 clsact + __addr_add_del ${rp3}_bond del 192.0.2.18/28 + ip link set dev $rp3 nomaster + ip link del dev ${rp3}_bond + + tc qdisc del dev $rp2 clsact + __addr_add_del $rp2 del 198.51.100.2/28 + ip link set dev $rp2 down + + tc qdisc del dev $rp1 clsact + __addr_add_del $rp1 del 192.0.2.2/28 + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + h3=${NETIFS[p5]} + rp3=${NETIFS[p6]} + h4=${NETIFS[p7]} + rp4=${NETIFS[p8]} + + vrf_prepare + + h1_create + h2_create + h3_create + h4_create + router_create +} + +cleanup() +{ + pre_cleanup + + rm -f $CAPTURE_FILE + + router_destroy + h4_destroy + h3_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +psample_capture_start() +{ + rm -f $CAPTURE_FILE + + psample &> $CAPTURE_FILE & + + sleep 1 +} + +psample_capture_stop() +{ + { kill %% && wait %%; } 2>/dev/null +} + +__tc_sample_rate_test() +{ + local desc=$1; shift + local dip=$1; shift + local pkts pct + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 32 group 1 + check_err $? 
"Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B $dip -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + log_test "tc sample rate ($desc)" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_rate_test() +{ + __tc_sample_rate_test "forward" 198.51.100.1 + __tc_sample_rate_test "local receive" 192.0.2.2 +} + +tc_sample_max_rate_test() +{ + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8 + 1)) \ + group 1 &> /dev/null + check_fail $? "Managed to configure sampling rate above maximum" + + log_test "tc sample maximum rate" +} + +tc_sample_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port, + # even when they share the same parameters. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 &> /dev/null + check_fail $? "Managed to configure second sampling rule" + + # Delete the first rule and make sure the second rule can now be + # configured. 
+ + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule after deletion" + + log_test "tc sample conflict test" + + tc filter del dev $rp1 ingress protocol all pref 2 handle 102 matchall +} + +tc_sample_group_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port + # with different groups. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample group conflict test" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_iif_test() +{ + local rp1_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp1_ifindex=$(ip -j -p link show dev $rp1 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp1_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample iif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_iif_test() +{ + local rp3_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? 
"Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp3_ifindex=$(ip -j -p link show dev $rp3 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp3_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample lag iif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_oif_test() +{ + local rp2_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp2_ifindex=$(ip -j -p link show dev $rp2 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp2_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-ifindex" + + log_test "tc sample oif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_oif_test() +{ + local rp4_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp4_ifindex=$(ip -j -p link show dev $rp4 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp4_ifindex " $CAPTURE_FILE + check_err $? 
"Sampled packets do not have expected out-ifindex" + + log_test "tc sample lag oif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_test() +{ + RET=0 + + # Output traffic class is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + # By default, all the packets should go to the same traffic class (0). + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 0 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (0)" + + # Map all priorities to highest traffic class (7) and check reported + # out-tc. + tc qdisc replace dev $rp2 root handle 1: \ + prio bands 3 priomap 0 0 0 0 0 0 0 0 + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 7 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (7)" + + log_test "tc sample out-tc" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_occ_test() +{ + local backlog pct occ + + RET=0 + + # Output traffic class occupancy is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + # Configure a shaper on egress to create congestion. 
+ tc qdisc replace dev $rp2 root handle 1: \ + tbf rate 1Mbit burst 256k limit 1M + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 0 -d 1usec -p 1400 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q & + + # Allow congestion to reach steady state. + sleep 10 + + backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]') + + # Kill mausezahn. + { kill %% && wait %%; } 2>/dev/null + + psample_capture_stop + + # Record last congestion sample. + occ=$(grep -e "out-tc-occ " $CAPTURE_FILE | tail -n 1 | \ + cut -d ' ' -f 16) + + pct=$((100 * (occ - backlog) / backlog)) + (( -1 <= pct && pct <= 1)) + check_err $? "Recorded a congestion of $backlog bytes, but sampled congestion is $occ bytes, which is $pct% off. Required accuracy is +-5%" + + log_test "tc sample out-tc-occ" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_latency_test() +{ + RET=0 + + # Egress sampling not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp2 egress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets do not have latency attribute" + + log_test "tc sample latency" + + tc filter del dev $rp2 egress protocol all pref 1 handle 101 matchall +} + +tc_sample_acl_group_conflict_test() +{ + RET=0 + + # Test that two flower sampling rules cannot be configured on the same + # port with different groups. + + # Policy-based sampling is not supported on Spectrum-1. 
+ [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol ip pref 2 handle 102 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule with same group" + + tc filter add dev $rp1 ingress protocol ip pref 3 handle 103 flower \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample (w/ flower) group conflict test" + + tc filter del dev $rp1 ingress protocol ip pref 2 handle 102 flower + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +__tc_sample_acl_rate_test() +{ + local bind=$1; shift + local port=$1; shift + local pkts pct + + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.1 action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + # Setup a filter that should not match any packet and make sure packets + # are not sampled. + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.10 action sample rate 32 group 1 + check_err $? 
"Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "group 1 " $CAPTURE_FILE + check_fail $? "Sampled packets when should not" + + log_test "tc sample (w/ flower) rate ($bind)" + + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower +} + +tc_sample_acl_rate_test() +{ + __tc_sample_acl_rate_test ingress $rp1 + __tc_sample_acl_rate_test egress $rp2 +} + +tc_sample_acl_max_rate_test() +{ + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24 - 1)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24)) \ + group 1 &> /dev/null + check_fail $? 
"Managed to configure sampling rate above maximum" + + log_test "tc sample (w/ flower) maximum rate" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 9f64d5c7107b..7ca1f030d209 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -24,8 +24,11 @@ function check { local code=$1 local str=$2 local exp_str=$3 + local exp_fail=$4 - if [ $code -ne 0 ]; then + [ -z "$exp_fail" ] && cop="-ne" || cop="-eq" + + if [ $code $cop 0 ]; then ((num_errors++)) return fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh new file mode 100755 index 000000000000..0c56746e9ce0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +NSIM_NETDEV=$(make_netdev) +[ a$ETHTOOL == a ] && ETHTOOL=ethtool + +set -o pipefail + +# netdevsim starts out with None/None +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: None +Active FEC encoding: None" + +# Test Auto +$ETHTOOL --set-fec $NSIM_NETDEV encoding auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto +Active FEC encoding: Off" + +# Test case in-sensitivity +for o in off Off OFF; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: Off +Active FEC encoding: Off" +done + +for o in BaseR baser BAser; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? 
"$s" "Configured FEC encodings: BaseR +Active FEC encoding: BaseR" +done + +for o in llrs rs; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: ${o^^} +Active FEC encoding: ${o^^}" +done + +# Test mutliple bits +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: RS LLRS +Active FEC encoding: LLRS" + +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto Off RS +Active FEC encoding: RS" + +# Make sure other link modes are rejected +$ETHTOOL --set-fec $NSIM_NETDEV encoding FIBRE 2>/dev/null +check $? '' '' 1 + +$ETHTOOL --set-fec $NSIM_NETDEV encoding bla-bla-bla 2>/dev/null +check $? '' '' 1 + +# Try JSON +$ETHTOOL --json --show-fec $NSIM_NETDEV | jq empty >>/dev/null 2>&1 +if [ $? -eq 0 ]; then + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"Off"' + + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto RS + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto" +"RS"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"RS"' +fi + +# Test error injection +echo 11 > $NSIM_DEV_DFS/ethtool/get_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? '' '' 1 + +echo 0 > $NSIM_DEV_DFS/ethtool/get_err +echo 11 > $NSIM_DEV_DFS/ethtool/set_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? + +$ETHTOOL --set-fec $NSIM_NETDEV encoding RS 2>/dev/null +check $? 
'' '' 1 + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh index be0c1b5ee6b8..ba75c81cda91 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -11,14 +11,33 @@ ALL_TESTS=" nexthop_single_add_err_test nexthop_group_add_test nexthop_group_add_err_test + nexthop_res_group_add_test + nexthop_res_group_add_err_test nexthop_group_replace_test nexthop_group_replace_err_test + nexthop_res_group_replace_test + nexthop_res_group_replace_err_test + nexthop_res_group_idle_timer_test + nexthop_res_group_idle_timer_del_test + nexthop_res_group_increase_idle_timer_test + nexthop_res_group_decrease_idle_timer_test + nexthop_res_group_unbalanced_timer_test + nexthop_res_group_unbalanced_timer_del_test + nexthop_res_group_no_unbalanced_timer_test + nexthop_res_group_short_unbalanced_timer_test + nexthop_res_group_increase_unbalanced_timer_test + nexthop_res_group_decrease_unbalanced_timer_test + nexthop_res_group_force_migrate_busy_test nexthop_single_replace_test nexthop_single_replace_err_test nexthop_single_in_group_replace_test nexthop_single_in_group_replace_err_test + nexthop_single_in_res_group_replace_test + nexthop_single_in_res_group_replace_err_test nexthop_single_in_group_delete_test nexthop_single_in_group_delete_err_test + nexthop_single_in_res_group_delete_test + nexthop_single_in_res_group_delete_err_test nexthop_replay_test nexthop_replay_err_test " @@ -27,6 +46,7 @@ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} DEVLINK_DEV=netdevsim/${DEV} SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +DEBUGFS_NET_DIR=/sys/kernel/debug/netdevsim/$DEV/ NUM_NETIFS=0 source $lib_dir/lib.sh source $lib_dir/devlink_lib.sh @@ -44,6 +64,28 @@ nexthop_check() return 0 
} +nexthop_bucket_nhid_count_check() +{ + local group_id=$1; shift + local expected + local count + local nhid + local ret + + while (($# > 0)); do + nhid=$1; shift + expected=$1; shift + + count=$($IP nexthop bucket show id $group_id nhid $nhid | + grep "trap" | wc -l) + if ((expected != count)); then + return 1 + fi + done + + return 0 +} + nexthop_resource_check() { local expected_occ=$1; shift @@ -159,6 +201,71 @@ nexthop_group_add_err_test() nexthop_resource_set 9999 } +nexthop_res_group_add_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy after delete" + + $IP nexthop add id 10 group 1,3/2,2 type resilient buckets 5 + nexthop_check "id 10" "id 10 group 1,3/2,2 type resilient buckets 5 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected weighted nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 3 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 7 + check_err $? "Wrong weighted nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? 
"Wrong nexthop occupancy after delete" + + log_test "Resilient nexthop group add and delete" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_add_err_test() +{ + RET=0 + + nexthop_resource_set 2 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 type resilient buckets 4 &> /dev/null + check_fail $? "Nexthop group addition succeeded when should fail" + + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy" + + log_test "Resilient nexthop group add failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + nexthop_group_replace_test() { RET=0 @@ -206,6 +313,411 @@ nexthop_group_replace_err_test() nexthop_resource_set 9999 } +nexthop_res_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 6 + + $IP nexthop replace id 10 group 1/2/3 type resilient + nexthop_check "id 10" "id 10 group 1/2/3 type resilient buckets 6 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 3 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 9 + check_err $? 
"Wrong nexthop occupancy" + + log_test "Resilient nexthop group replace" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_replace_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 6 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_res_nexthop_group_replace + $IP nexthop replace id 10 group 1/2/3 type resilient &> /dev/null + check_fail $? "Nexthop group replacement succeeded when should fail" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 6 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_bucket_nhid_count_check 10 1 3 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 3 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 9 + check_err $? 
"Wrong nexthop occupancy after failure" + + log_test "Resilient nexthop group replace failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_res_nexthop_group_replace +} + +nexthop_res_mark_buckets_busy() +{ + local group_id=$1; shift + local nhid=$1; shift + local count=$1; shift + local index + + for index in $($IP -j nexthop bucket show id $group_id nhid $nhid | + jq '.[].bucket.index' | head -n ${count:--0}) + do + echo $group_id $index \ + > $DEBUGFS_NET_DIR/fib/nexthop_bucket_activity + done +} + +nexthop_res_num_nhid_buckets() +{ + local group_id=$1; shift + local nhid=$1; shift + + $IP -j nexthop bucket show id $group_id nhid $nhid | jq length +} + +nexthop_res_group_idle_timer_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 idle_timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "Group expected to be unbalanced" + + sleep 6 + + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after idle timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_idle_timer_del_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,50/2,50/3,1 \ + type resilient buckets 8 idle_timer 6 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,50/2,150/3,1 type resilient + + nexthop_bucket_nhid_count_check 10 1 4 2 4 3 0 + check_err $? "Group expected to be unbalanced" + + sleep 4 + + # Deletion prompts group replacement. Check that the bucket timers + # are kept. + $IP nexthop delete id 3 + + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? 
"Group expected to still be unbalanced" + + sleep 4 + + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after idle timer (with delete)" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_increase_timer_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 $timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 8 + sleep 4 + + # 6 seconds, past the original timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group still expected to be unbalanced" + + sleep 4 + + # 10 seconds, past the new timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after $timer increase" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_decrease_timer_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 $timer 8 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 4 + sleep 4 + + # 6 seconds, past the new timer, before the old timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? 
"Group expected to be balanced" + + log_test "Bucket migration after $timer decrease" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_increase_timer_del_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,100/2,100/3,1 \ + type resilient buckets 8 $timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,100/2,300/3,1 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 8 + sleep 4 + + # 6 seconds, past the original timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group still expected to be unbalanced" + + sleep 4 + + # 10 seconds, past the new timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after $timer increase" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_increase_idle_timer_test() +{ + __nexthop_res_group_increase_timer_test idle_timer +} + +nexthop_res_group_decrease_idle_timer_test() +{ + __nexthop_res_group_decrease_timer_test idle_timer +} + +nexthop_res_group_unbalanced_timer_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 6 unbalanced_timer 10 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + for i in 1 2; do + sleep 4 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "$i: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + done + + # 3 x sleep 4 > unbalanced timer 10 + sleep 4 + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? 
"Group expected to be balanced" + + log_test "Bucket migration after unbalanced timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_unbalanced_timer_del_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,50/2,50/3,1 type resilient \ + buckets 8 idle_timer 6 unbalanced_timer 10 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,50/2,150/3,1 type resilient + + # Check that NH delete does not reset unbalanced time. + sleep 4 + $IP nexthop delete id 3 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "1: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + + sleep 4 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "2: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + + # 3 x sleep 4 > unbalanced timer 10 + sleep 4 + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after unbalanced timer (with delete)" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_no_unbalanced_timer_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + for i in $(seq 3); do + sleep 60 + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? 
"$i: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + done + + log_test "Buckets never force-migrated without unbalanced timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_short_unbalanced_timer_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 120 unbalanced_timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 5 + + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after unbalanced < idle timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_increase_unbalanced_timer_test() +{ + __nexthop_res_group_increase_timer_test unbalanced_timer +} + +nexthop_res_group_decrease_unbalanced_timer_test() +{ + __nexthop_res_group_decrease_timer_test unbalanced_timer +} + +nexthop_res_group_force_migrate_busy_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 120 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + $IP nexthop replace id 10 group 2 type resilient + nexthop_bucket_nhid_count_check 10 2 8 + check_err $? 
"All buckets expected to have migrated" + + log_test "Busy buckets force-migrated when NH removed" + + $IP nexthop flush &> /dev/null +} + nexthop_single_replace_test() { RET=0 @@ -299,6 +811,63 @@ nexthop_single_in_group_replace_err_test() nexthop_resource_set 9999 } +nexthop_single_in_res_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 + check_err $? "Failed to replace nexthop when should not" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace while in resilient group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_res_group_replace_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 &> /dev/null + check_fail $? "Nexthop replacement succeeded when should fail" + + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after failure" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_bucket_nhid_count_check 10 1 2 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? 
"Wrong nexthop occupancy" + + log_test "Single nexthop replace while in resilient group failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace +} + nexthop_single_in_group_delete_test() { RET=0 @@ -346,6 +915,57 @@ nexthop_single_in_group_delete_err_test() nexthop_resource_set 9999 } +nexthop_single_in_res_group_delete_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + $IP nexthop del id 1 + nexthop_check "id 10" "id 10 group 2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 2 4 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 5 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop delete while in resilient group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_res_group_delete_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2/3 type resilient buckets 6 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace + $IP nexthop del id 1 + + # We failed to replace the two nexthop buckets that were originally + # assigned to nhid 1. + nexthop_bucket_nhid_count_check 10 2 2 3 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 8 + check_err $? 
"Wrong nexthop occupancy" + + log_test "Single nexthop delete while in resilient group failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace +} + nexthop_replay_test() { RET=0 diff --git a/tools/testing/selftests/drivers/net/netdevsim/psample.sh b/tools/testing/selftests/drivers/net/netdevsim/psample.sh new file mode 100755 index 000000000000..ee10b1a8933c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/psample.sh @@ -0,0 +1,181 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the psample module. It makes use of netdevsim +# which periodically generates "sampled" packets. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + psample_enable_test + psample_group_num_test + psample_md_test +" +NETDEVSIM_PATH=/sys/bus/netdevsim/ +DEV_ADDR=1337 +DEV=netdevsim${DEV_ADDR} +DEVLINK_DEV=netdevsim/${DEV} +SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +PSAMPLE_DIR=/sys/kernel/debug/netdevsim/$DEV/psample/ +CAPTURE_FILE=$(mktemp) +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +psample_capture() +{ + rm -f $CAPTURE_FILE + + timeout 2 ip netns exec testns1 psample &> $CAPTURE_FILE +} + +psample_enable_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + check_err $? "Failed to enable sampling when should not" + + echo 1 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? "Sampling enablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -eq 0 ]; then + check_err 1 "Failed to capture sampled packets" + fi + + echo 0 > $PSAMPLE_DIR/enable + check_err $? "Failed to disable sampling when should not" + + echo 0 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? 
"Sampling disablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -ne 0 ]; then + check_err 1 "Captured sampled packets when should not" + fi + + log_test "psample enable / disable" +} + +psample_group_num_test() +{ + RET=0 + + echo 1234 > $PSAMPLE_DIR/group_num + echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong group number" + + # New group number should only be used after disable / enable. + echo 4321 > $PSAMPLE_DIR/group_num + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_fail $? "Group number changed while sampling is active" + + echo 0 > $PSAMPLE_DIR/enable && echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_err $? "Group number did not change after restarting sampling" + + log_test "psample group number" + + echo 0 > $PSAMPLE_DIR/enable +} + +psample_md_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + + echo 1234 > $PSAMPLE_DIR/in_ifindex + echo 4321 > $PSAMPLE_DIR/out_ifindex + psample_capture + + grep -q -e "in-ifindex 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong in-ifindex" + + grep -q -e "out-ifindex 4321" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-ifindex" + + echo 5 > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc 5" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-tc" + + echo $((2**16 - 1)) > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc " $CAPTURE_FILE + check_fail $? "Sampled packets reported with out-tc when should not" + + echo 1 > $PSAMPLE_DIR/out_tc + echo 10000 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_err $? "Sampled packets not reported with out-tc-occ when should" + + echo 0 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_fail $? 
"Sampled packets reported with out-tc-occ when should not" + + echo 10000 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets not reported with latency when should" + + echo 0 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_fail $? "Sampled packets reported with latency when should not" + + log_test "psample metadata" + + echo 0 > $PSAMPLE_DIR/enable +} + +setup_prepare() +{ + modprobe netdevsim &> /dev/null + + echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device + while [ ! -d $SYSFS_NET_DIR ] ; do :; done + + set -e + + ip netns add testns1 + devlink dev reload $DEVLINK_DEV netns testns1 + + set +e +} + +cleanup() +{ + pre_cleanup + rm -f $CAPTURE_FILE + ip netns del testns1 + echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device + modprobe -r netdevsim &> /dev/null +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c index 5ebc1aec7923..0e393cb5f42d 100644 --- a/tools/testing/selftests/firmware/fw_namespace.c +++ b/tools/testing/selftests/firmware/fw_namespace.c @@ -95,7 +95,7 @@ static bool test_fw_in_ns(const char *fw_name, const char *sys_path, bool block_ } if (block_fw_in_parent_ns) umount("/lib/firmware"); - return WEXITSTATUS(status) == EXIT_SUCCESS ? 
true : false; + return WEXITSTATUS(status) == EXIT_SUCCESS; } if (unshare(CLONE_NEWNS) != 0) { diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 32b87cc77c8e..7bd7e776c266 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,10 +8,13 @@ /x86_64/debug_regs /x86_64/evmcs_test /x86_64/get_cpuid_test +/x86_64/get_msr_index_features /x86_64/kvm_pv_test +/x86_64/hyperv_clock /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test +/x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..cb95b5bace7b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -include ../../../../scripts/Kbuild.include +include ../../../build/Build.include all: @@ -39,12 +39,15 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 2f2eeb8a1d86..5aadf84c91c0 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ 
b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -108,7 +108,7 @@ static void run_test(uint32_t run) kvm_vm_elf_load(vm, program_invocation_name, 0, 0); vm_create_irqchip(vm); - fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run); + pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { vm_vcpu_add_default(vm, i, guest_code); payloads[i].vm = vm; @@ -124,7 +124,7 @@ static void run_test(uint32_t run) check_set_affinity(throw_away, &cpu_set); } } - fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run); + pr_debug("%s: [%d] all threads launched\n", __func__, run); sem_post(sem); for (i = 0; i < VCPU_NUM; ++i) check_join(threads[i], &b); @@ -147,16 +147,16 @@ int main(int argc, char **argv) if (pid == 0) run_test(i); /* This function always exits */ - fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i); + pr_debug("%s: [%d] waiting semaphore\n", __func__, i); sem_wait(sem); r = (rand() % DELAY_US_MAX) + 1; - fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r); + pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); usleep(r); r = waitpid(pid, &s, WNOHANG); TEST_ASSERT(r != pid, "%s: [%d] child exited unexpectedly status: [%d]", __func__, i, s); - fprintf(stderr, "%s: [%d] killing child\n", __func__, i); + pr_debug("%s: [%d] killing child\n", __func__, i); kill(pid, SIGKILL); } diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2d7eb6989e83..0f4258eaa629 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -16,6 +16,7 @@ #include "sparsebit.h" +#define KVM_DEV_PATH "/dev/kvm" #define KVM_MAX_VCPUS 512 /* @@ -133,6 +134,7 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); +int _vm_ioctl(struct kvm_vm *vm, 
unsigned long cmd, void *arg); void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e5fbf16f725b..b8849a1aca79 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1697,11 +1697,16 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { int ret; - ret = ioctl(vm->fd, cmd, arg); + ret = _vm_ioctl(vm, cmd, arg); TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", cmd, ret, errno, strerror(errno)); } +int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +{ + return ioctl(vm->fd, cmd, arg); +} + /* * KVM system ioctl * diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 34465dc562d8..91ce1b5d480b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -10,8 +10,6 @@ #include "sparsebit.h" -#define KVM_DEV_PATH "/dev/kvm" - struct userspace_mem_region { struct kvm_userspace_memory_region region; struct sparsebit *unused_phy_pages; diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c new file mode 100644 index 000000000000..cb953df4d7d0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_GET_MSR_INDEX_LIST and + * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended + * + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +static int kvm_num_index_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +static void test_get_msr_index(void) +{ + int old_res, res, kvm_fd, r; + struct kvm_msr_list *list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_index_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_index_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0])); + list->nmsrs = old_res; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); + TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical"); + free(list); + + close(kvm_fd); +} + +static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * 
sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", + r); + + return list; +} + +static void test_get_msr_feature(void) +{ + int res, old_res, i, kvm_fd; + struct kvm_msr_list *feature_list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_feature_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_feature_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + feature_list = kvm_get_msr_feature_list(kvm_fd, old_res); + TEST_ASSERT(old_res == feature_list->nmsrs, + "Unmatching number of msr indexes"); + + for (i = 0; i < feature_list->nmsrs; i++) + kvm_get_feature_msr(feature_list->indices[i]); + + free(feature_list); + close(kvm_fd); +} + +int main(int argc, char *argv[]) +{ + if (kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) + test_get_msr_feature(); + + test_get_msr_index(); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c new file mode 100644 index 000000000000..7f1d2765572c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. 
+ * + * Tests for Hyper-V clocksources + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 + +/* Simplified mul_u64_u64_shr(): high 64 bits of a * b, ignoring carries out of the low 64 bits */ +static inline u64 mul_u64_u64_shr64(u64 a, u64 b) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rm.ll = (u64)a0.l.low * b0.l.high; + rn.ll = (u64)a0.l.high * b0.l.low; + rh.ll = (u64)a0.l.high * b0.l.high; + + rh.l.low = c = (u64)rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + return rh.ll; +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +static inline void check_tsc_msr_rdtsc(void) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); + GUEST_ASSERT(tsc_freq > 0); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + GUEST_ASSERT(r2 > r1 && t2 > t1); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100); +} + +static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page) +{ + return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset; +} + +static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page) +{ + u64 r1, r2, t1, t2; + + /* Compare 
TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */ + t1 = get_tscpage_ts(tsc_page); + r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + /* 10 ms tolerance */ + GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000); + nop_loop(); + + t2 = get_tscpage_ts(tsc_page); + r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + GUEST_ASSERT(r2 >= t2 && r2 - t2 < 100000); +} + +static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa) +{ + u64 tsc_scale, tsc_offset; + + /* Set Guest OS id to enable Hyper-V emulation */ + GUEST_SYNC(1); + wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + GUEST_SYNC(2); + + check_tsc_msr_rdtsc(); + + GUEST_SYNC(3); + + /* Set up TSC page in disabled state, check that it's clean */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + + GUEST_SYNC(4); + + /* Set up TSC page in enabled state */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1); + GUEST_ASSERT(tsc_page->tsc_sequence != 0); + + GUEST_SYNC(5); + + check_tsc_msr_tsc_page(tsc_page); + + GUEST_SYNC(6); + + tsc_offset = tsc_page->tsc_offset; + /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */ + + GUEST_SYNC(7); + /* Sanity check TSC page timestamp, it should be close to 0 */ + GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000); + + GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset); + + nop_loop(); + + /* + * Enable Re-enlightenment and check that TSC page stays constant across + * KVM_SET_CLOCK. 
+ */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); + tsc_offset = tsc_page->tsc_offset; + tsc_scale = tsc_page->tsc_scale; + GUEST_SYNC(8); + GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); + GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); + + GUEST_SYNC(9); + + check_tsc_msr_tsc_page(tsc_page); + + /* + * Disable re-enlightenment and TSC page, check that KVM doesn't update + * it anymore. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); + wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); + memset(tsc_page, 0, sizeof(*tsc_page)); + + GUEST_SYNC(10); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + + GUEST_DONE(); +} + +#define VCPU_ID 0 + +static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); + TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + + TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t2, t1); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, + "Elapsed time does not match (MSR=%ld, TSC=%ld)", + (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + vm_vaddr_t tsc_page_gva; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + run = vcpu_state(vm, VCPU_ID); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + tsc_page_gva = vm_vaddr_alloc(vm, 
getpagesize(), 0x10000, 0, 0); + memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); + TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, + "TSC page has to be page aligned\n"); + vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + + host_check_tsc_msr_rdtsc(vm); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + /* Keep in sync with guest_main() */ + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d\n", + stage); + goto out; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, + "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + /* Reset kvmclock triggering TSC page update */ + if (stage == 7 || stage == 8 || stage == 10) { + struct kvm_clock_data clock = {0}; + + vm_ioctl(vm, KVM_SET_CLOCK, &clock); + } + } + +out: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c new file mode 100644 index 000000000000..12c558fc8074 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_SET_BOOT_CPU_ID works as intended + * + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#define _GNU_SOURCE /* for program_invocation_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define N_VCPU 2 +#define VCPU_ID0 0 +#define VCPU_ID1 1 + +static uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static void guest_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() != 0); + + GUEST_DONE(); +} + +static void guest_not_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() == 0); + + GUEST_DONE(); +} + +static void test_set_boot_busy(struct kvm_vm *vm) +{ + int res; + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); + TEST_ASSERT(res == -1 && errno == EBUSY, + "KVM_SET_BOOT_CPU_ID set while running vm"); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct ucall uc; + int stage; + + for (stage = 0; stage < 2; stage++) { + + vcpu_run(vm, vcpuid); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + test_set_boot_busy(vm); + break; + case UCALL_DONE: + TEST_ASSERT(stage == 1, + "Expected GUEST_DONE in stage 2, got stage %d", + stage); + break; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } + } +} + +static struct kvm_vm *create_vm(void) +{ + struct kvm_vm *vm; + uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2; + uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + + pages = 
vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); + vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_create_irqchip(vm); + + return vm; +} + +static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) +{ + if (bsp_code) + vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); + else + vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); + + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); +} + +static void run_vm_bsp(uint32_t bsp_vcpu) +{ + struct kvm_vm *vm; + bool is_bsp_vcpu1 = bsp_vcpu == VCPU_ID1; + + vm = create_vm(); + + if (is_bsp_vcpu1) + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + + add_x86_vcpu(vm, VCPU_ID0, !is_bsp_vcpu1); + add_x86_vcpu(vm, VCPU_ID1, is_bsp_vcpu1); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + kvm_vm_free(vm); +} + +static void check_set_bsp_busy(void) +{ + struct kvm_vm *vm; + int res; + + vm = create_vm(); + + add_x86_vcpu(vm, VCPU_ID0, true); + add_x86_vcpu(vm, VCPU_ID1, false); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu"); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a terminated vcpu"); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + if (!kvm_check_cap(KVM_CAP_SET_BOOT_CPU_ID)) { + print_skip("set_boot_cpu_id not available"); + return 0; + } + + run_vm_bsp(VCPU_ID0); + run_vm_bsp(VCPU_ID1); + run_vm_bsp(VCPU_ID0); + + check_set_bsp_busy(); +} diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a5ce26d548e4..0af84ad48aa7 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -1,6 +1,10 @@ # This mimics the top-level Makefile. 
We do it explicitly here so that this # Makefile can operate with or without the kbuild infrastructure. +ifneq ($(LLVM),) +CC := clang +else CC := $(CROSS_COMPILE)gcc +endif ifeq (0,$(MAKELEVEL)) ifeq ($(OUTPUT),) @@ -74,7 +78,8 @@ ifdef building_out_of_srctree rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \ fi @if [ "X$(TEST_PROGS)" != "X" ]; then \ - $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(OUTPUT)/$(TEST_PROGS)) ; \ + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) \ + $(addprefix $(OUTPUT)/,$(TEST_PROGS))) ; \ else \ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)); \ fi diff --git a/tools/testing/selftests/lkdtm/.gitignore b/tools/testing/selftests/lkdtm/.gitignore index f26212605b6b..d4b0be857deb 100644 --- a/tools/testing/selftests/lkdtm/.gitignore +++ b/tools/testing/selftests/lkdtm/.gitignore @@ -1,2 +1,3 @@ *.sh !run.sh +!stack-entropy.sh diff --git a/tools/testing/selftests/lkdtm/Makefile b/tools/testing/selftests/lkdtm/Makefile index 1bcc9ee990eb..c71109ceeb2d 100644 --- a/tools/testing/selftests/lkdtm/Makefile +++ b/tools/testing/selftests/lkdtm/Makefile @@ -5,6 +5,7 @@ include ../lib.mk # NOTE: $(OUTPUT) won't get default value if used before lib.mk TEST_FILES := tests.txt +TEST_PROGS := stack-entropy.sh TEST_GEN_PROGS = $(patsubst %,$(OUTPUT)/%.sh,$(shell awk '{print $$1}' tests.txt | sed -e 's/\#//')) all: $(TEST_GEN_PROGS) diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh new file mode 100755 index 000000000000..b1b8a5097cbb --- /dev/null +++ b/tools/testing/selftests/lkdtm/stack-entropy.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Measure kernel stack entropy by sampling via LKDTM's REPORT_STACK test. +set -e +samples="${1:-1000}" + +# Capture dmesg continuously since it may fill up depending on sample size. +log=$(mktemp -t stack-entropy-XXXXXX) +dmesg --follow >"$log" & pid=$! 
+report=-1 +for i in $(seq 1 $samples); do + echo "REPORT_STACK" >/sys/kernel/debug/provoke-crash/DIRECT + if [ -t 1 ]; then + percent=$(( 100 * $i / $samples )) + if [ "$percent" -ne "$report" ]; then + /bin/echo -en "$percent%\r" + report="$percent" + fi + fi +done +kill "$pid" + +# Count unique offsets since last run. +seen=$(tac "$log" | grep -m1 -B"$samples"0 'Starting stack offset' | \ + grep 'Stack offset' | awk '{print $NF}' | sort | uniq -c | wc -l) +bits=$(echo "obase=2; $seen" | bc | wc -L) +echo "Bits of stack entropy: $bits" +rm -f "$log" + +# We would expect any functional stack randomization to be at least 5 bits. +if [ "$bits" -lt 5 ]; then + exit 1 +else + exit 0 +fi diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 25f198bec0b2..3915bb7bfc39 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -23,6 +23,8 @@ TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh +TEST_PROGS += udpgro_fwd.sh +TEST_PROGS += veth.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any @@ -37,6 +39,8 @@ TEST_GEN_FILES += ipsec TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +TEST_FILES := settings + KSFT_KHDR_INSTALL := 1 include ../lib.mk diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d98fb85e201c..49774a8a7736 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,10 +19,39 @@ ret=0 ksft_skip=4 # all tests in this script. 
Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" - -ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" +IPV4_TESTS=" + ipv4_fcnal + ipv4_grp_fcnal + ipv4_res_grp_fcnal + ipv4_withv6_fcnal + ipv4_fcnal_runtime + ipv4_large_grp + ipv4_large_res_grp + ipv4_compat_mode + ipv4_fdb_grp_fcnal + ipv4_torture + ipv4_res_torture +" + +IPV6_TESTS=" + ipv6_fcnal + ipv6_grp_fcnal + ipv6_res_grp_fcnal + ipv6_fcnal_runtime + ipv6_large_grp + ipv6_large_res_grp + ipv6_compat_mode + ipv6_fdb_grp_fcnal + ipv6_torture + ipv6_res_torture +" + +ALL_TESTS=" + basic + basic_res + ${IPV4_TESTS} + ${IPV6_TESTS} +" TESTS="${ALL_TESTS}" VERBOSE=0 PAUSE_ON_FAIL=no @@ -232,6 +261,19 @@ check_nexthop() check_output "${out}" "${expected}" } +check_nexthop_bucket() +{ + local nharg="$1" + local expected="$2" + local out + + # remove the idle time since we cannot match it + out=$($IP nexthop bucket ${nharg} \ + | sed s/idle_time\ [0-9.]*\ // 2>/dev/null) + + check_output "${out}" "${expected}" +} + check_route() { local pfx="$1" @@ -308,6 +350,25 @@ check_large_grp() log_test $? 0 "Dump large (x$ecmp) ecmp groups" } +check_large_res_grp() +{ + local ipv=$1 + local buckets=$2 + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1.2" + else + ipstr="2001:db8:91::2" + fi + + # create a resilient group with $buckets buckets and dump them + run_cmd "$IP nexthop add id 100 via $ipstr dev veth1" + run_cmd "$IP nexthop add id 1000 group 100 type resilient buckets $buckets" + run_cmd "$IP nexthop bucket list" + log_test $? 0 "Dump large (x$buckets) nexthop buckets" +} + start_ip_monitor() { local mtype=$1 @@ -344,6 +405,15 @@ check_nexthop_fdb_support() fi } +check_nexthop_res_support() +{ + $IP nexthop help 2>&1 | grep -q resilient + if [ $? 
-ne 0 ]; then + echo "SKIP: iproute2 too old, missing resilient nexthop group support" + return $ksft_skip + fi +} + ipv6_fdb_grp_fcnal() { local rc @@ -666,6 +736,70 @@ ipv6_grp_fcnal() log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" } +ipv6_res_grp_fcnal() +{ + local rc + + echo + echo "IPv6 resilient groups functional" + echo "--------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # + # migration of nexthop buckets - equal weights + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 102 group 62/63 type resilient buckets 2 idle_timer 0" + + run_cmd "$IP nexthop del id 63" + check_nexthop "id 102" \ + "id 102 group 62 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 62 id 102 index 1 nhid 62" + log_test $? 0 "Nexthop buckets updated when entry is deleted" + + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 62/63 type resilient buckets 2 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 62/63 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 63 id 102 index 1 nhid 62" + log_test $? 
0 "Nexthop buckets updated after replace" + + $IP nexthop flush >/dev/null 2>&1 + + # + # migration of nexthop buckets - unequal weights + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 102 group 62,3/63,1 type resilient buckets 4 idle_timer 0" + + run_cmd "$IP nexthop del id 63" + check_nexthop "id 102" \ + "id 102 group 62,3 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 62 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" + log_test $? 0 "Nexthop buckets updated when entry is deleted - nECMP" + + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 62,3/63,1 type resilient buckets 4 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 62,3/63 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" + log_test $? 0 "Nexthop buckets updated after replace - nECMP" +} + ipv6_fcnal_runtime() { local rc @@ -824,6 +958,22 @@ ipv6_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv6_large_res_grp() +{ + echo + echo "IPv6 large resilient group (128k buckets)" + echo "-----------------------------------------" + + check_nexthop_res_support + if [ $? 
-eq $ksft_skip ]; then + return $ksft_skip + fi + + check_large_res_grp 6 $((128 * 1024)) + + $IP nexthop flush >/dev/null 2>&1 +} + ipv6_del_add_loop1() { while :; do @@ -874,11 +1024,67 @@ ipv6_torture() sleep 300 kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null # if we did not crash, success log_test 0 0 "IPv6 torture test" } +ipv6_res_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 type resilient + done >/dev/null 2>&1 +} + +ipv6_res_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime resilient nexthop group torture" + echo "--------------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101 type resilient buckets 512 idle_timer 0" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_res_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn -6 veth1 \ + -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 \ + -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null + + # if we did not crash, success + log_test 0 0 "IPv6 resilient nexthop group torture test" +} ipv4_fcnal() { @@ -1038,6 +1244,70 @@ ipv4_grp_fcnal() log_test $? 
2 "Nexthop group can not have a blackhole and another nexthop" } +ipv4_res_grp_fcnal() +{ + local rc + + echo + echo "IPv4 resilient groups functional" + echo "--------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # + # migration of nexthop buckets - equal weights + # + run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop add id 102 group 12/13 type resilient buckets 2 idle_timer 0" + + run_cmd "$IP nexthop del id 13" + check_nexthop "id 102" \ + "id 102 group 12 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 12 id 102 index 1 nhid 12" + log_test $? 0 "Nexthop buckets updated when entry is deleted" + + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 12/13 type resilient buckets 2 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 12/13 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 13 id 102 index 1 nhid 12" + log_test $? 0 "Nexthop buckets updated after replace" + + $IP nexthop flush >/dev/null 2>&1 + + # + # migration of nexthop buckets - unequal weights + # + run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop add id 102 group 12,3/13,1 type resilient buckets 4 idle_timer 0" + + run_cmd "$IP nexthop del id 13" + check_nexthop "id 102" \ + "id 102 group 12,3 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 
0 "Nexthop group updated when entry is deleted - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 12 id 102 index 1 nhid 12 id 102 index 2 nhid 12 id 102 index 3 nhid 12" + log_test $? 0 "Nexthop buckets updated when entry is deleted - nECMP" + + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 12,3/13,1 type resilient buckets 4 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 12,3/13 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 13 id 102 index 1 nhid 12 id 102 index 2 nhid 12 id 102 index 3 nhid 12" + log_test $? 0 "Nexthop buckets updated after replace - nECMP" +} + ipv4_withv6_fcnal() { local lladdr @@ -1259,6 +1529,22 @@ ipv4_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv4_large_res_grp() +{ + echo + echo "IPv4 large resilient group (128k buckets)" + echo "-----------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + check_large_res_grp 4 $((128 * 1024)) + + $IP nexthop flush >/dev/null 2>&1 +} + sysctl_nexthop_compat_mode_check() { local sysctlname="net.ipv4.nexthop_compat_mode" @@ -1476,11 +1762,68 @@ ipv4_torture() sleep 300 kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null # if we did not crash, success log_test 0 0 "IPv4 torture test" } +ipv4_res_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 type resilient + done >/dev/null 2>&1 +} + +ipv4_res_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime resilient nexthop group torture" + echo "--------------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + if [ ! 
-x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101 type resilient buckets 512 idle_timer 0" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_res_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 \ + -B 172.16.101.2 -A 172.16.1.1 -c 0 \ + -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null + + # if we did not crash, success + log_test 0 0 "IPv4 resilient nexthop group torture test" +} + basic() { echo @@ -1590,6 +1933,219 @@ basic() log_test $? 2 "Nexthop group and blackhole" $IP nexthop flush >/dev/null 2>&1 + + # Test to ensure that flushing with a multi-part nexthop dump works as + # expected. + local batch_file=$(mktemp) + + for i in $(seq 1 $((64 * 1024))); do + echo "nexthop add id $i blackhole" >> $batch_file + done + + $IP -b $batch_file + $IP nexthop flush >/dev/null 2>&1 + [[ $($IP nexthop | wc -l) -eq 0 ]] + log_test $? 0 "Large scale nexthop flushing" + + rm $batch_file +} + +check_nexthop_buckets_balance() +{ + local nharg=$1; shift + local ret + + while (($# > 0)); do + local selector=$1; shift + local condition=$1; shift + local count + + count=$($IP -j nexthop bucket ${nharg} ${selector} | jq length) + (( $count $condition )) + ret=$? + if ((ret != 0)); then + return $ret + fi + done + + return 0 +} + +basic_res() +{ + echo + echo "Basic resilient nexthop group functional tests" + echo "----------------------------------------------" + + check_nexthop_res_support + if [ $? 
-eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 1 dev veth1" + + # + # resilient nexthop group addition + # + + run_cmd "$IP nexthop add id 101 group 1 type resilient buckets 8" + log_test $? 0 "Add a nexthop group with default parameters" + + run_cmd "$IP nexthop get id 101" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 120 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Get a nexthop group with default parameters" + + run_cmd "$IP nexthop add id 102 group 1 type resilient + buckets 4 idle_timer 100 unbalanced_timer 5" + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" \ + "id 102 group 1 type resilient buckets 4 idle_timer 100 unbalanced_timer 5 unbalanced_time 0" + log_test $? 0 "Get a nexthop group with non-default parameters" + + run_cmd "$IP nexthop add id 103 group 1 type resilient buckets 0" + log_test $? 2 "Add a nexthop group with 0 buckets" + + # + # resilient nexthop group replacement + # + + run_cmd "$IP nexthop replace id 101 group 1 type resilient + buckets 8 idle_timer 240 unbalanced_timer 80" + log_test $? 0 "Replace nexthop group parameters" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 240 unbalanced_timer 80 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing parameters" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient idle_timer 512" + log_test $? 0 "Replace idle timer" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 80 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing idle timer" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient unbalanced_timer 256" + log_test $? 0 "Replace unbalanced timer" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 
0 "Get a nexthop group after replacing unbalanced timer" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient" + log_test $? 0 "Replace with no parameters" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing no parameters" + + run_cmd "$IP nexthop replace id 101 group 1" + log_test $? 2 "Replace nexthop group type - implicit" + + run_cmd "$IP nexthop replace id 101 group 1 type mpath" + log_test $? 2 "Replace nexthop group type - explicit" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient buckets 1024" + log_test $? 2 "Replace number of nexthop buckets" + + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing with invalid parameters" + + # + # resilient nexthop buckets dump + # + + $IP nexthop flush >/dev/null 2>&1 + run_cmd "$IP nexthop add id 1 dev veth1" + run_cmd "$IP nexthop add id 2 dev veth3" + run_cmd "$IP nexthop add id 101 group 1/2 type resilient buckets 4" + run_cmd "$IP nexthop add id 201 group 1/2" + + check_nexthop_bucket "" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2 id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 0 "Dump all nexthop buckets" + + check_nexthop_bucket "list id 101" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2 id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 0 "Dump all nexthop buckets in a group" + + (( $($IP -j nexthop bucket list id 101 | + jq '[.[] | select(.bucket.idle_time > 0 and + .bucket.idle_time < 2)] | length') == 4 )) + log_test $? 0 "All nexthop buckets report a positive near-zero idle time" + + check_nexthop_bucket "list dev veth1" \ + "id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 
0 "Dump all nexthop buckets with a specific nexthop device" + + check_nexthop_bucket "list nhid 2" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2" + log_test $? 0 "Dump all nexthop buckets with a specific nexthop identifier" + + run_cmd "$IP nexthop bucket list id 111" + log_test $? 2 "Dump all nexthop buckets in a non-existent group" + + run_cmd "$IP nexthop bucket list id 201" + log_test $? 2 "Dump all nexthop buckets in a non-resilient group" + + run_cmd "$IP nexthop bucket list dev bla" + log_test $? 255 "Dump all nexthop buckets using a non-existent device" + + run_cmd "$IP nexthop bucket list groups" + log_test $? 255 "Dump all nexthop buckets with invalid 'groups' keyword" + + run_cmd "$IP nexthop bucket list fdb" + log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword" + + # + # resilient nexthop buckets get requests + # + + check_nexthop_bucket "get id 101 index 0" "id 101 index 0 nhid 2" + log_test $? 0 "Get a valid nexthop bucket" + + run_cmd "$IP nexthop bucket get id 101 index 999" + log_test $? 2 "Get a nexthop bucket with valid group, but invalid index" + + run_cmd "$IP nexthop bucket get id 201 index 0" + log_test $? 2 "Get a nexthop bucket from a non-resilient group" + + run_cmd "$IP nexthop bucket get id 999 index 0" + log_test $? 2 "Get a nexthop bucket from a non-existent group" + + # + # tests for bucket migration + # + + $IP nexthop flush >/dev/null 2>&1 + + run_cmd "$IP nexthop add id 1 dev veth1" + run_cmd "$IP nexthop add id 2 dev veth3" + run_cmd "$IP nexthop add id 101 + group 1/2 type resilient buckets 10 + idle_timer 1 unbalanced_timer 20" + + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 5" \ + "nhid 2" "== 5" + log_test $? 0 "Initial bucket allocation" + + run_cmd "$IP nexthop replace id 101 + group 1,2/2,3 type resilient" + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 4" \ + "nhid 2" "== 6" + log_test $? 
0 "Bucket allocation after replace" + + # Check that increase in idle timer does not make buckets appear busy. + run_cmd "$IP nexthop replace id 101 + group 1,2/2,3 type resilient + idle_timer 10" + run_cmd "$IP nexthop replace id 101 + group 1/2 type resilient" + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 5" \ + "nhid 2" "== 5" + log_test $? 0 "Buckets migrated after idle timer change" + + $IP nexthop flush >/dev/null 2>&1 } ################################################################################ diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 2b5707738609..76d9487fb03c 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,7 +9,7 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr" +TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1653,6 +1653,154 @@ ipv4_route_v6_gw_test() route_cleanup } +socat_check() +{ + if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 + fi + + return 0 +} + +iptables_check() +{ + iptables -t mangle -L OUTPUT &> /dev/null + if [ $? -ne 0 ]; then + echo "iptables configuration not supported. Skipping test" + return 1 + fi + + return 0 +} + +ip6tables_check() +{ + ip6tables -t mangle -L OUTPUT &> /dev/null + if [ $? -ne 0 ]; then + echo "ip6tables configuration not supported. 
Skipping test" + return 1 + fi + + return 0 +} + +ipv4_mangle_test() +{ + local rc + + echo + echo "IPv4 mangling tests" + + socat_check || return 1 + iptables_check || return 1 + + route_setup + sleep 2 + + local tmp_file=$(mktemp) + ip netns exec ns2 socat UDP4-LISTEN:54321,fork $tmp_file & + + # Add a FIB rule and a route that will direct our connection to the + # listening server. + $IP rule add pref 100 ipproto udp sport 12345 dport 54321 table 123 + $IP route add table 123 172.16.101.0/24 dev veth1 + + # Add an unreachable route to the main table that will block our + # connection in case the FIB rule is not hit. + $IP route add unreachable 172.16.101.2/32 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters" + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=11111" + log_test $? 1 " Connection with incorrect parameters" + + # Add a mangling rule and make sure connection is still successful. + $NS_EXEC iptables -t mangle -A OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - mangling" + + # Delete the mangling rule and make sure connection is still + # successful. + $NS_EXEC iptables -t mangle -D OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - no mangling" + + # Verify connections were indeed successful on server side. + [[ $(cat $tmp_file | wc -l) -eq 3 ]] + log_test $? 
0 " Connection check - server side" + + $IP route del unreachable 172.16.101.2/32 + $IP route del table 123 172.16.101.0/24 dev veth1 + $IP rule del pref 100 + + { kill %% && wait %%; } 2>/dev/null + rm $tmp_file + + route_cleanup +} + +ipv6_mangle_test() +{ + local rc + + echo + echo "IPv6 mangling tests" + + socat_check || return 1 + ip6tables_check || return 1 + + route_setup + sleep 2 + + local tmp_file=$(mktemp) + ip netns exec ns2 socat UDP6-LISTEN:54321,fork $tmp_file & + + # Add a FIB rule and a route that will direct our connection to the + # listening server. + $IP -6 rule add pref 100 ipproto udp sport 12345 dport 54321 table 123 + $IP -6 route add table 123 2001:db8:101::/64 dev veth1 + + # Add an unreachable route to the main table that will block our + # connection in case the FIB rule is not hit. + $IP -6 route add unreachable 2001:db8:101::2/128 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters" + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=11111" + log_test $? 1 " Connection with incorrect parameters" + + # Add a mangling rule and make sure connection is still successful. + $NS_EXEC ip6tables -t mangle -A OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - mangling" + + # Delete the mangling rule and make sure connection is still + # successful. + $NS_EXEC ip6tables -t mangle -D OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - no mangling" + + # Verify connections were indeed successful on server side. + [[ $(cat $tmp_file | wc -l) -eq 3 ]] + log_test $? 
0 " Connection check - server side" + + $IP -6 route del unreachable 2001:db8:101::2/128 + $IP -6 route del table 123 2001:db8:101::/64 dev veth1 + $IP -6 rule del pref 100 + + { kill %% && wait %%; } 2>/dev/null + rm $tmp_file + + route_cleanup +} + ################################################################################ # usage @@ -1725,6 +1873,8 @@ do ipv6_route_metrics) ipv6_route_metrics_test;; ipv4_route_metrics) ipv4_route_metrics_test;; ipv4_route_v6_gw) ipv4_route_v6_gw_test;; + ipv4_mangle) ipv4_mangle_test;; + ipv6_mangle) ipv6_mangle_test;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh new file mode 100755 index 000000000000..5148d97a5df8 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh @@ -0,0 +1,366 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 (vrf) | | H2 (vrf) | +# | + h1.10 | | + h2.20 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | + $h1 | | + $h2 | +# | | | | | | +# +----|---------------+ +--|-------------------+ +# | | +# +----|--------------------------------------------------|--------------------+ +# | SW | | | +# | +--|-------------------------------+ +----------------|------------------+ | +# | | + $swp1 BR1 (802.1ad) | | BR2 (802.1d) + $swp2 | | +# | | vid 100 pvid untagged | | | | | +# | | | | + $swp2.20 | | +# | | | | | | +# | | + vx100 (vxlan) | | + vx200 (vxlan) | | +# | | local 192.0.2.17 | | local 192.0.2.17 | | +# | | remote 192.0.2.34 | | remote 192.0.2.50 | | +# | | id 1000 dstport $VXPORT | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | +# | +--------------------------------- + +-----------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | 192.0.2.48/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# 
+----|-----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | VRP2 (vrf) | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | (maybe) HW +# ============================================================================= +# | | (likely) SW +# | + v1 (veth) + v3 (veth) | +# | | 192.0.2.33/28 | 192.0.2.49/28 | +# +----|---------------------------------------|----------------+ +# | | +# +----|------------------------------+ +----|------------------------------+ +# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) | +# | 192.0.2.34/28 | | 192.0.2.50/28 | +# | | | | +# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 | +# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 | +# | | | | +# | +-------------------------------+ | | +-------------------------------+ | +# | | BR3 (802.1ad) | | | | BR3 (802.1d) | | +# | | + vx100 (vxlan) | | | | + vx200 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | | +# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | | +# | | id 1000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | | | +# | | | | | | + w1.20 | | +# | | | | | | | | | +# | | + w1 (veth) | | | | + w1 (veth) | | +# | | | vid 100 pvid untagged | | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | VW2 (vrf) | | | | | VW2 (vrf) | | +# | | + w2 (veth) | | | | + w2 (veth) | | +# | | | | | | | | | | +# | | | | | | | | | | +# | | + w2.10 | | | | + w2.20 | | +# | | 192.0.2.3/28 | | | | 192.0.2.4/28 | | +# | +-------------------------------+ | | +-------------------------------+ | +# +-----------------------------------+ +-----------------------------------+ + +: ${VXPORT:=4789} +export VXPORT + +: ${ALL_TESTS:=" + ping_ipv4 
+ "} + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 + tc qdisc add dev $h1 clsact + vlan_create $h1 10 v$h1 192.0.2.1/28 +} + +h1_destroy() +{ + vlan_destroy $h1 10 + tc qdisc del dev $h1 clsact + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + tc qdisc add dev $h2 clsact + vlan_create $h2 20 v$h2 192.0.2.2/28 +} + +h2_destroy() +{ + vlan_destroy $h2 20 + tc qdisc del dev $h2 clsact + simple_if_fini $h2 +} + +rp1_set_addr() +{ + ip address add dev $rp1 192.0.2.17/28 + + ip route add 192.0.2.32/28 nexthop via 192.0.2.18 + ip route add 192.0.2.48/28 nexthop via 192.0.2.18 +} + +rp1_unset_addr() +{ + ip route del 192.0.2.48/28 nexthop via 192.0.2.18 + ip route del 192.0.2.32/28 nexthop via 192.0.2.18 + + ip address del dev $rp1 192.0.2.17/28 +} + +switch_create() +{ + #### BR1 #### + ip link add name br1 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + #### BR2 #### + ip link add name br2 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. 
+ ip link set dev br2 address $(mac_get $swp2) + ip link set dev br2 up + + ip link set dev $rp1 up + rp1_set_addr + + #### VX100 #### + ip link add name vx100 type vxlan id 1000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev vx100 master br1 + bridge vlan add vid 100 dev vx100 pvid untagged + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + bridge vlan add vid 100 dev $swp1 pvid untagged + + #### VX200 #### + ip link add name vx200 type vxlan id 2000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx200 up + + ip link set dev vx200 master br2 + + ip link set dev $swp2 up + ip link add name $swp2.20 link $swp2 type vlan id 20 + ip link set dev $swp2.20 master br2 + ip link set dev $swp2.20 up + + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + bridge fdb append dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self +} + +switch_destroy() +{ + bridge fdb del dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self + bridge fdb del dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + + ip link set dev vx200 nomaster + ip link set dev vx200 down + ip link del dev vx200 + + ip link del dev $swp2.20 + ip link set dev $swp2 down + ip link set dev $swp2 nomaster + + bridge vlan del vid 100 dev $swp1 + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + + ip link set dev vx100 nomaster + ip link set dev vx100 down + ip link del dev vx100 + + rp1_unset_addr + ip link set dev $rp1 down + + ip link set dev br2 down + ip link del dev br2 + + ip link set dev br1 down + ip link del dev br1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + __simple_if_init v1 v$rp2 192.0.2.33/28 + __simple_if_init v3 v$rp2 192.0.2.49/28 + tc qdisc add dev v1 clsact +} + +vrp2_destroy() +{ + tc qdisc del dev v1 clsact + __simple_if_fini v3 192.0.2.49/28 + __simple_if_fini v1 192.0.2.33/28 + simple_if_fini $rp2 192.0.2.18/28 +} + 
+ns_init_common() +{ + local in_if=$1; shift + local in_addr=$1; shift + local other_in_addr=$1; shift + local vxlan_name=$1; shift + local vxlan_id=$1; shift + local vlan_id=$1; shift + local host_addr=$1; shift + local nh_addr=$1; shift + + ip link set dev $in_if up + ip address add dev $in_if $in_addr/28 + tc qdisc add dev $in_if clsact + + ip link add name br3 type bridge vlan_filtering 0 + ip link set dev br3 up + + ip link add name w1 type veth peer name w2 + + ip link set dev w1 master br3 + ip link set dev w1 up + + ip link add name $vxlan_name type vxlan id $vxlan_id local $in_addr \ + dstport "$VXPORT" + ip link set dev $vxlan_name up + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst 192.0.2.17 self + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst $other_in_addr self + + ip link set dev $vxlan_name master br3 + tc qdisc add dev $vxlan_name clsact + + simple_if_init w2 + vlan_create w2 $vlan_id vw2 $host_addr/28 + + ip route add 192.0.2.16/28 nexthop via $nh_addr + ip route add $other_in_addr/32 nexthop via $nh_addr +} +export -f ns_init_common + +ns1_create() +{ + ip netns add ns1 + ip link set dev v2 netns ns1 + in_ns ns1 \ + ns_init_common v2 192.0.2.34 192.0.2.50 vx100 1000 10 192.0.2.3 \ + 192.0.2.33 + + in_ns ns1 bridge vlan add vid 100 dev vx100 pvid untagged +} + +ns1_destroy() +{ + ip netns exec ns1 ip link set dev v2 netns 1 + ip netns del ns1 +} + +ns2_create() +{ + ip netns add ns2 + ip link set dev v4 netns ns2 + in_ns ns2 \ + ns_init_common v4 192.0.2.50 192.0.2.34 vx200 2000 20 192.0.2.4 \ + 192.0.2.49 + + in_ns ns2 ip link add name w1.20 link w1 type vlan id 20 + in_ns ns2 ip link set dev w1.20 master br3 + in_ns ns2 ip link set dev w1.20 up +} + +ns2_destroy() +{ + ip netns exec ns2 ip link set dev v4 netns 1 + ip netns del ns2 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1=${NETIFS[p5]} + rp2=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create 
+ h2_create + switch_create + + ip link add name v1 type veth peer name v2 + ip link add name v3 type veth peer name v4 + vrp2_create + ns1_create + ns2_create + + r1_mac=$(in_ns ns1 mac_get w2) + r2_mac=$(in_ns ns2 mac_get w2) + h2_mac=$(mac_get $h2) +} + +cleanup() +{ + pre_cleanup + + ns2_destroy + ns1_destroy + vrp2_destroy + ip link del dev v3 + ip link del dev v1 + + switch_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.3 ": local->remote 1 through VxLAN with an 802.1ad bridge" + ping_test $h2 192.0.2.4 ": local->remote 2 through VxLAN with an 802.1d bridge" +} + +test_all() +{ + echo "Running tests with UDP port $VXPORT" + tests_run +} + +trap cleanup EXIT + +setup_prepare +setup_wait +test_all + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh index 66496659bea7..e134a5f529c9 100644 --- a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh +++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh @@ -224,7 +224,7 @@ fib_ipv4_plen_test() ip -n $ns link set dev dummy1 up # Add two routes with the same key and different prefix length and - # make sure both are in hardware. It can be verfied that both are + # make sure both are in hardware. It can be verified that both are # sharing the same leaf by checking the /proc/net/fib_trie ip -n $ns route add 192.0.2.0/24 dev dummy1 ip -n $ns route add 192.0.2.0/25 dev dummy1 diff --git a/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh b/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh new file mode 100755 index 000000000000..088b65e64d66 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh @@ -0,0 +1,361 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test traffic distribution when a wECMP route forwards traffic to two GRE +# tunnels. 
+# +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.1/28 | | +# | 2001:db8:1::1/64 | | +# +-------------------|-----+ +# | +# +-------------------|------------------------+ +# | SW1 | | +# | $ol1 + | +# | 192.0.2.2/28 | +# | 2001:db8:1::2/64 | +# | | +# | + g1a (gre) + g1b (gre) | +# | loc=192.0.2.65 loc=192.0.2.81 | +# | rem=192.0.2.66 --. rem=192.0.2.82 --. | +# | tos=inherit | tos=inherit | | +# | .------------------' | | +# | | .------------------' | +# | v v | +# | + $ul1.111 (vlan) + $ul1.222 (vlan) | +# | | 192.0.2.129/28 | 192.0.2.145/28 | +# | \ / | +# | \________________/ | +# | | | +# | + $ul1 | +# +------------|-------------------------------+ +# | +# +------------|-------------------------------+ +# | SW2 + $ul2 | +# | _______|________ | +# | / \ | +# | / \ | +# | + $ul2.111 (vlan) + $ul2.222 (vlan) | +# | ^ 192.0.2.130/28 ^ 192.0.2.146/28 | +# | | | | +# | | '------------------. | +# | '------------------. | | +# | + g2a (gre) | + g2b (gre) | | +# | loc=192.0.2.66 | loc=192.0.2.82 | | +# | rem=192.0.2.65 --' rem=192.0.2.81 --' | +# | tos=inherit tos=inherit | +# | | +# | $ol2 + | +# | 192.0.2.17/28 | | +# | 2001:db8:2::1/64 | | +# +-------------------|------------------------+ +# | +# +-------------------|-----+ +# | H2 | | +# | $h2 + | +# | 192.0.2.18/28 | +# | 2001:db8:2::2/64 | +# +-------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_ipv4 + multipath_ipv6 + multipath_ipv6_l4 +" + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2 + ip route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 +} + +h1_destroy() +{ + ip route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 + ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2 + simple_if_fini $h1 192.0.2.1/28 +} + +sw1_create() +{ + simple_if_init $ol1 192.0.2.2/28 2001:db8:1::2/64 + __simple_if_init $ul1 v$ol1 + vlan_create $ul1 111 v$ol1 192.0.2.129/28 + vlan_create 
$ul1 222 v$ol1 192.0.2.145/28 + + tunnel_create g1a gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1 + __simple_if_init g1a v$ol1 192.0.2.65/32 + ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + + tunnel_create g1b gre 192.0.2.81 192.0.2.82 tos inherit dev v$ol1 + __simple_if_init g1b v$ol1 192.0.2.81/32 + ip route add vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + + ip -6 nexthop add id 101 dev g1a + ip -6 nexthop add id 102 dev g1b + ip nexthop add id 103 group 101/102 type resilient buckets 512 \ + idle_timer 0 + + ip route add vrf v$ol1 192.0.2.16/28 nhid 103 + ip route add vrf v$ol1 2001:db8:2::/64 nhid 103 +} + +sw1_destroy() +{ + ip route del vrf v$ol1 2001:db8:2::/64 + ip route del vrf v$ol1 192.0.2.16/28 + + ip nexthop del id 103 + ip -6 nexthop del id 102 + ip -6 nexthop del id 101 + + ip route del vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + __simple_if_fini g1b 192.0.2.81/32 + tunnel_destroy g1b + + ip route del vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + __simple_if_fini g1a 192.0.2.65/32 + tunnel_destroy g1a + + vlan_destroy $ul1 222 + vlan_destroy $ul1 111 + __simple_if_fini $ul1 + simple_if_fini $ol1 192.0.2.2/28 2001:db8:1::2/64 +} + +sw2_create() +{ + simple_if_init $ol2 192.0.2.17/28 2001:db8:2::1/64 + __simple_if_init $ul2 v$ol2 + vlan_create $ul2 111 v$ol2 192.0.2.130/28 + vlan_create $ul2 222 v$ol2 192.0.2.146/28 + + tunnel_create g2a gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol2 + __simple_if_init g2a v$ol2 192.0.2.66/32 + ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + + tunnel_create g2b gre 192.0.2.82 192.0.2.81 tos inherit dev v$ol2 + __simple_if_init g2b v$ol2 192.0.2.82/32 + ip route add vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + + ip -6 nexthop add id 201 dev g2a + ip -6 nexthop add id 202 dev g2b + ip nexthop add id 203 group 201/202 type resilient buckets 512 \ + idle_timer 0 + + ip route add vrf v$ol2 192.0.2.0/28 nhid 203 + ip route add vrf v$ol2 2001:db8:1::/64 nhid 203 + + tc qdisc add dev $ul2 clsact + tc filter add dev $ul2 
ingress pref 111 prot 802.1Q \ + flower vlan_id 111 action pass + tc filter add dev $ul2 ingress pref 222 prot 802.1Q \ + flower vlan_id 222 action pass +} + +sw2_destroy() +{ + tc qdisc del dev $ul2 clsact + + ip route del vrf v$ol2 2001:db8:1::/64 + ip route del vrf v$ol2 192.0.2.0/28 + + ip nexthop del id 203 + ip -6 nexthop del id 202 + ip -6 nexthop del id 201 + + ip route del vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + __simple_if_fini g2b 192.0.2.82/32 + tunnel_destroy g2b + + ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + __simple_if_fini g2a 192.0.2.66/32 + tunnel_destroy g2a + + vlan_destroy $ul2 222 + vlan_destroy $ul2 111 + __simple_if_fini $ul2 + simple_if_fini $ol2 192.0.2.17/28 2001:db8:2::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.18/28 2001:db8:2::2/64 + ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17 + ip route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 + ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17 + simple_if_fini $h2 192.0.2.18/28 2001:db8:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + vrf_prepare + h1_create + sw1_create + sw2_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +multipath4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv4.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -q -p 64 -A 192.0.2.1 -B 192.0.2.18 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 
222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +multipath6_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 0 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + # Generate 16384 echo requests, each with a random flow label. + for ((i=0; i < 16384; ++i)); do + ip vrf exec v$h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q &> /dev/null + done + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +multipath6_l4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::1 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.18 +} + +ping_ipv6() +{ + ping6_test 
$h1 2001:db8:2::2 +} + +multipath_ipv4() +{ + log_info "Running IPv4 multipath tests" + multipath4_test "ECMP" 1 1 + multipath4_test "Weighted MP 2:1" 2 1 + multipath4_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6() +{ + log_info "Running IPv6 multipath tests" + multipath6_test "ECMP" 1 1 + multipath6_test "Weighted MP 2:1" 2 1 + multipath6_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6_l4() +{ + log_info "Running IPv6 L4 hash multipath tests" + multipath6_l4_test "ECMP" 1 1 + multipath6_l4_test "Weighted MP 2:1" 2 1 + multipath6_l4_test "Weighted MP 11:45" 11 45 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index be71012b8fc5..42e28c983d41 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -353,6 +353,11 @@ wait_for_offload() "$@" | grep -q offload } +wait_for_trap() +{ + "$@" | grep -q trap +} + until_counter_is() { local expr=$1; shift @@ -767,6 +772,15 @@ rate() echo $((8 * (t1 - t0) / interval)) } +packets_rate() +{ + local t0=$1; shift + local t1=$1; shift + local interval=$1; shift + + echo $(((t1 - t0) / interval)) +} + mac_get() { local if_name=$1 diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh index c02291e9841e..880e3ab9d088 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh @@ -271,7 +271,7 @@ test_span_gre_fdb_roaming() while ((RET == 0)); do bridge fdb del dev $swp3 $h3mac vlan 555 master 2>/dev/null - bridge fdb add dev $swp2 $h3mac vlan 555 master + bridge fdb add dev $swp2 $h3mac vlan 555 master static sleep 1 fail_test_span_gre_dir $tundev ingress diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh 
b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 13db1cb50e57..6406cd76a19d 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -20,6 +20,13 @@ mirror_uninstall() tc filter del dev $swp1 $direction pref 1000 } +is_ipv6() +{ + local addr=$1; shift + + [[ -z ${addr//[0-9a-fA-F:]/} ]] +} + mirror_test() { local vrf_name=$1; shift @@ -29,9 +36,17 @@ mirror_test() local pref=$1; shift local expect=$1; shift + if is_ipv6 $dip; then + local proto=-6 + local type="icmp6 type=128" # Echo request. + else + local proto= + local type="icmp echoreq" + fi + local t0=$(tc_rule_stats_get $dev $pref) - $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ - -c 10 -d 100msec -t icmp type=8 + $MZ $proto $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ + -c 10 -d 100msec -t $type sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh new file mode 100755 index 000000000000..4898dd4118f1 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -0,0 +1,400 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_test +" +NUM_NETIFS=8 +source lib.sh + +h1_create() +{ + vrf_create "vrf-h1" + ip link set dev $h1 master vrf-h1 + + ip link set dev vrf-h1 up + ip link set dev $h1 up + + ip address add 192.0.2.2/24 dev $h1 + ip address add 2001:db8:1::2/64 dev $h1 + + ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1 + ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1 +} + +h1_destroy() +{ + ip route del 2001:db8:2::/64 vrf vrf-h1 + ip route del 198.51.100.0/24 vrf vrf-h1 + + ip address del 2001:db8:1::2/64 dev $h1 + ip address del 192.0.2.2/24 dev $h1 + + ip link set dev $h1 down + vrf_destroy "vrf-h1" +} + +h2_create() +{ + vrf_create "vrf-h2" 
+ ip link set dev $h2 master vrf-h2 + + ip link set dev vrf-h2 up + ip link set dev $h2 up + + ip address add 198.51.100.2/24 dev $h2 + ip address add 2001:db8:2::2/64 dev $h2 + + ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1 + ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del 2001:db8:1::/64 vrf vrf-h2 + ip route del 192.0.2.0/24 vrf vrf-h2 + + ip address del 2001:db8:2::2/64 dev $h2 + ip address del 198.51.100.2/24 dev $h2 + + ip link set dev $h2 down + vrf_destroy "vrf-h2" +} + +router1_create() +{ + vrf_create "vrf-r1" + ip link set dev $rp11 master vrf-r1 + ip link set dev $rp12 master vrf-r1 + ip link set dev $rp13 master vrf-r1 + + ip link set dev vrf-r1 up + ip link set dev $rp11 up + ip link set dev $rp12 up + ip link set dev $rp13 up + + ip address add 192.0.2.1/24 dev $rp11 + ip address add 2001:db8:1::1/64 dev $rp11 + + ip address add 169.254.2.12/24 dev $rp12 + ip address add fe80:2::12/64 dev $rp12 + + ip address add 169.254.3.13/24 dev $rp13 + ip address add fe80:3::13/64 dev $rp13 +} + +router1_destroy() +{ + ip route del 2001:db8:2::/64 vrf vrf-r1 + ip route del 198.51.100.0/24 vrf vrf-r1 + + ip address del fe80:3::13/64 dev $rp13 + ip address del 169.254.3.13/24 dev $rp13 + + ip address del fe80:2::12/64 dev $rp12 + ip address del 169.254.2.12/24 dev $rp12 + + ip address del 2001:db8:1::1/64 dev $rp11 + ip address del 192.0.2.1/24 dev $rp11 + + ip nexthop del id 103 + ip nexthop del id 101 + ip nexthop del id 102 + ip nexthop del id 106 + ip nexthop del id 104 + ip nexthop del id 105 + + ip link set dev $rp13 down + ip link set dev $rp12 down + ip link set dev $rp11 down + + vrf_destroy "vrf-r1" +} + +router2_create() +{ + vrf_create "vrf-r2" + ip link set dev $rp21 master vrf-r2 + ip link set dev $rp22 master vrf-r2 + ip link set dev $rp23 master vrf-r2 + + ip link set dev vrf-r2 up + ip link set dev $rp21 up + ip link set dev $rp22 up + ip link set dev $rp23 up + + ip address 
add 198.51.100.1/24 dev $rp21 + ip address add 2001:db8:2::1/64 dev $rp21 + + ip address add 169.254.2.22/24 dev $rp22 + ip address add fe80:2::22/64 dev $rp22 + + ip address add 169.254.3.23/24 dev $rp23 + ip address add fe80:3::23/64 dev $rp23 +} + +router2_destroy() +{ + ip route del 2001:db8:1::/64 vrf vrf-r2 + ip route del 192.0.2.0/24 vrf vrf-r2 + + ip address del fe80:3::23/64 dev $rp23 + ip address del 169.254.3.23/24 dev $rp23 + + ip address del fe80:2::22/64 dev $rp22 + ip address del 169.254.2.22/24 dev $rp22 + + ip address del 2001:db8:2::1/64 dev $rp21 + ip address del 198.51.100.1/24 dev $rp21 + + ip nexthop del id 201 + ip nexthop del id 202 + ip nexthop del id 204 + ip nexthop del id 205 + + ip link set dev $rp23 down + ip link set dev $rp22 down + ip link set dev $rp21 down + + vrf_destroy "vrf-r2" +} + +routing_nh_obj() +{ + ip nexthop add id 101 via 169.254.2.22 dev $rp12 + ip nexthop add id 102 via 169.254.3.23 dev $rp13 + ip nexthop add id 103 group 101/102 type resilient buckets 512 \ + idle_timer 0 + ip route add 198.51.100.0/24 vrf vrf-r1 nhid 103 + + ip nexthop add id 104 via fe80:2::22 dev $rp12 + ip nexthop add id 105 via fe80:3::23 dev $rp13 + ip nexthop add id 106 group 104/105 type resilient buckets 512 \ + idle_timer 0 + ip route add 2001:db8:2::/64 vrf vrf-r1 nhid 106 + + ip nexthop add id 201 via 169.254.2.12 dev $rp22 + ip nexthop add id 202 via 169.254.3.13 dev $rp23 + ip nexthop add id 203 group 201/202 type resilient buckets 512 \ + idle_timer 0 + ip route add 192.0.2.0/24 vrf vrf-r2 nhid 203 + + ip nexthop add id 204 via fe80:2::12 dev $rp22 + ip nexthop add id 205 via fe80:3::13 dev $rp23 + ip nexthop add id 206 group 204/205 type resilient buckets 512 \ + idle_timer 0 + ip route add 2001:db8:1::/64 vrf vrf-r2 nhid 206 +} + +multipath4_test() +{ + local desc="$1" + local weight_rp12=$2 + local weight_rp13=$3 + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 + local packets_rp12 packets_rp13 + + # Transmit multiple flows from h1 to h2 
and make sure they are + # distributed between both multipath links (rp12 and rp13) + # according to the provided weights. + sysctl_set net.ipv4.fib_multipath_hash_policy 1 + + t0_rp12=$(link_stats_tx_packets_get $rp12) + t0_rp13=$(link_stats_tx_packets_get $rp13) + + ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + t1_rp12=$(link_stats_tx_packets_get $rp12) + t1_rp13=$(link_stats_tx_packets_get $rp13) + + let "packets_rp12 = $t1_rp12 - $t0_rp12" + let "packets_rp13 = $t1_rp13 - $t0_rp13" + multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13 + + # Restore settings. + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +multipath6_l4_test() +{ + local desc="$1" + local weight_rp12=$2 + local weight_rp13=$3 + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 + local packets_rp12 packets_rp13 + + # Transmit multiple flows from h1 to h2 and make sure they are + # distributed between both multipath links (rp12 and rp13) + # according to the provided weights. + sysctl_set net.ipv6.fib_multipath_hash_policy 1 + + t0_rp12=$(link_stats_tx_packets_get $rp12) + t0_rp13=$(link_stats_tx_packets_get $rp13) + + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + t1_rp12=$(link_stats_tx_packets_get $rp12) + t1_rp13=$(link_stats_tx_packets_get $rp13) + + let "packets_rp12 = $t1_rp12 - $t0_rp12" + let "packets_rp13 = $t1_rp13 - $t0_rp13" + multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13 + + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +multipath_test() +{ + # Without an idle timer, weight replacement should happen immediately. 
+ log_info "Running multipath tests without an idle timer" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 0 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 0 + + log_info "Running IPv4 multipath tests" + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 2 1 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 2 1 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 106 group 104,1/105,1 type resilient + + # With an idle timer, weight replacement should not happen, so the + # expected ratio should always be the initial one (1:1). 
+ log_info "Running multipath tests with an idle timer of 120 seconds" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 120 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 120 + + log_info "Running IPv4 multipath tests" + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 1 1 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 1 1 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 1 1 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 1 1 + + ip nexthop replace id 106 group 104,1/105,1 type resilient + + # With a short idle timer and enough idle time, weight replacement + # should happen. 
+ log_info "Running multipath tests with an idle timer of 5 seconds" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 5 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 5 + + log_info "Running IPv4 multipath tests" + sleep 10 + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + sleep 10 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 2 1 + sleep 10 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + sleep 10 + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + sleep 10 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 2 1 + sleep 10 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 106 group 104,1/105,1 type resilient +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp11=${NETIFS[p2]} + + rp12=${NETIFS[p3]} + rp22=${NETIFS[p4]} + + rp13=${NETIFS[p5]} + rp23=${NETIFS[p6]} + + rp21=${NETIFS[p7]} + h2=${NETIFS[p8]} + + vrf_prepare + + h1_create + h2_create + + router1_create + router2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router2_destroy + router1_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 198.51.100.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +ip nexthop ls >/dev/null 2>&1 +if [ $? 
-ne 0 ]; then + echo "Nexthop objects not supported; skipping tests" + exit 0 +fi + +trap cleanup EXIT + +setup_prepare +setup_wait +routing_nh_obj + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh index 160f9cccdfb7..4f9f17cb45d6 100755 --- a/tools/testing/selftests/net/forwarding/tc_police.sh +++ b/tools/testing/selftests/net/forwarding/tc_police.sh @@ -35,6 +35,8 @@ ALL_TESTS=" police_shared_test police_rx_mirror_test police_tx_mirror_test + police_pps_rx_test + police_pps_tx_test " NUM_NETIFS=6 source tc_common.sh @@ -290,6 +292,60 @@ police_tx_mirror_test() police_mirror_common_test $rp2 egress "police tx and mirror" } +police_pps_common_test() +{ + local test_name=$1; shift + + RET=0 + + # Rule to measure bandwidth on ingress of $h2 + tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action drop + + mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \ + -t udp sp=12345,dp=54321 -p 1000 -c 0 -q & + + local t0=$(tc_rule_stats_get $h2 1 ingress .packets) + sleep 10 + local t1=$(tc_rule_stats_get $h2 1 ingress .packets) + + local er=$((2000)) + local nr=$(packets_rate $t0 $t1 10) + local nr_pct=$((100 * (nr - er) / er)) + ((-10 <= nr_pct && nr_pct <= 10)) + check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%." 
+ + log_test "$test_name" + + { kill %% && wait %%; } 2>/dev/null + tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_rx_test() +{ + # Rule to police traffic destined to $h2 on ingress of $rp1 + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on rx" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_tx_test() +{ + # Rule to police traffic destined to $h2 on egress of $rp2 + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on tx" + + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower +} + setup_prepare() { h1=${NETIFS[p1]} diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index ce6bea9675c0..eb307ca37bfa 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -657,10 +657,21 @@ test_ecn_decap() { # In accordance with INET_ECN_decapsulate() __test_ecn_decap 00 00 0x00 + __test_ecn_decap 00 01 0x00 + __test_ecn_decap 00 02 0x00 + # 00 03 is tested in test_ecn_decap_error() + __test_ecn_decap 01 00 0x01 __test_ecn_decap 01 01 0x01 - __test_ecn_decap 02 01 0x02 + __test_ecn_decap 01 02 0x01 __test_ecn_decap 01 03 0x03 + __test_ecn_decap 02 00 0x02 + __test_ecn_decap 02 01 0x01 + __test_ecn_decap 02 02 0x02 __test_ecn_decap 02 03 0x03 + __test_ecn_decap 03 00 0x03 + __test_ecn_decap 03 01 0x03 + __test_ecn_decap 03 02 0x03 + __test_ecn_decap 03 03 0x03 test_ecn_decap_error } diff --git a/tools/testing/selftests/net/mptcp/Makefile 
b/tools/testing/selftests/net/mptcp/Makefile index 00bb158b4a5d..f1464f09b080 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -6,7 +6,7 @@ KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ - simult_flows.sh + simult_flows.sh mptcp_sockopt.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 39edce4f541c..2674ba20d524 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -5,8 +5,9 @@ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns="ns1-$rndh" ksft_skip=4 test_cnt=1 +timeout_poll=100 +timeout_test=$((timeout_poll * 2 + 1)) ret=0 -pids=() flush_pids() { @@ -14,18 +15,14 @@ flush_pids() # give it some time sleep 1.1 - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -SIGUSR1 $pid >/dev/null 2>&1 - done - pids=() + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGUSR1 &>/dev/null } cleanup() { + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null + ip netns del $ns - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -9 $pid >/dev/null 2>&1 - done } ip -Version > /dev/null 2>&1 @@ -79,39 +76,57 @@ trap cleanup EXIT ip netns add $ns ip -n $ns link set dev lo up -echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null & +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -pids[0]=$! chk_msk_nr 0 "no msk on netns creation" -echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null & +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 -pids[1]=$! 
chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" flush_pids -echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null & -pids[0]=$! +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null & -pids[1]=$! +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 chk_msk_fallback_nr 1 "check fallback" flush_pids NR_CLIENTS=100 for I in `seq 1 $NR_CLIENTS`; do - echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null & - pids[$((I*2))]=$! + echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -l -w 10 \ + -t ${timeout_poll} 0.0.0.0 >/dev/null & done sleep 0.1 for I in `seq 1 $NR_CLIENTS`; do - echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null & - pids[$((I*2 + 1))]=$! 
+ echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -w 10 \ + -t ${timeout_poll} 127.0.0.1 >/dev/null & done sleep 1.5 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 77bb62feb872..d88e1fdfb147 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -45,7 +45,14 @@ enum cfg_mode { CFG_MODE_SENDFILE, }; +enum cfg_peek { + CFG_NONE_PEEK, + CFG_WITH_PEEK, + CFG_AFTER_PEEK, +}; + static enum cfg_mode cfg_mode = CFG_MODE_POLL; +static enum cfg_peek cfg_peek = CFG_NONE_PEEK; static const char *cfg_host; static const char *cfg_port = "12000"; static int cfg_sock_proto = IPPROTO_MPTCP; @@ -55,7 +62,9 @@ static int cfg_sndbuf; static int cfg_rcvbuf; static bool cfg_join; static bool cfg_remove; +static unsigned int cfg_do_w; static int cfg_wait; +static uint32_t cfg_mark; static void die_usage(void) { @@ -68,8 +77,11 @@ static void die_usage(void) fprintf(stderr, "\t-p num -- use port num\n"); fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); + fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); + fprintf(stderr, + "\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n"); exit(1); } @@ -139,6 +151,17 @@ static void set_sndbuf(int fd, unsigned int size) } } +static void set_mark(int fd, uint32_t mark) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); + if (err) { + perror("set SO_MARK"); + exit(1); + } +} + static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { @@ -247,6 +270,9 @@ static int sock_connect_mptcp(const char * const remoteaddr, continue; } + if 
(cfg_mark) + set_mark(sock, cfg_mark); + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) break; /* success */ @@ -272,8 +298,8 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (cfg_join && first && do_w > 100) do_w = 100; - if (cfg_remove && do_w > 50) - do_w = 50; + if (cfg_remove && do_w > cfg_do_w) + do_w = cfg_do_w; bw = write(fd, buf, do_w); if (bw < 0) @@ -314,6 +340,8 @@ static size_t do_write(const int fd, char *buf, const size_t len) static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) { + int ret = 0; + char tmp[16384]; size_t cap = rand(); cap &= 0xffff; @@ -323,7 +351,17 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) else if (cap > len) cap = len; - return read(fd, buf, cap); + if (cfg_peek == CFG_WITH_PEEK) { + ret = recv(fd, buf, cap, MSG_PEEK); + ret = (ret < 0) ? ret : read(fd, tmp, ret); + } else if (cfg_peek == CFG_AFTER_PEEK) { + ret = recv(fd, buf, cap, MSG_PEEK); + ret = (ret < 0) ? ret : read(fd, buf, cap); + } else { + ret = read(fd, buf, cap); + } + + return ret; } static void set_nonblock(int fd) @@ -802,6 +840,26 @@ int parse_mode(const char *mode) return 0; } +int parse_peek(const char *mode) +{ + if (!strcasecmp(mode, "saveWithPeek")) + return CFG_WITH_PEEK; + if (!strcasecmp(mode, "saveAfterPeek")) + return CFG_AFTER_PEEK; + + fprintf(stderr, "Unknown: %s\n", mode); + fprintf(stderr, "Supported MSG_PEEK mode are:\n"); + fprintf(stderr, + "\t\t\"saveWithPeek\" - recv data with flags 'MSG_PEEK' and save the peek data into file\n"); + fprintf(stderr, + "\t\t\"saveAfterPeek\" - read and save data into file after recv with flags 'MSG_PEEK'\n"); + + die_usage(); + + /* silence compiler warning */ + return 0; +} + static int parse_int(const char *size) { unsigned long s; @@ -829,7 +887,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jrlp:s:hut:m:S:R:w:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:P:")) 
!= -1) { switch (c) { case 'j': cfg_join = true; @@ -840,6 +898,9 @@ static void parse_opts(int argc, char **argv) cfg_remove = true; cfg_mode = CFG_MODE_POLL; cfg_wait = 400000; + cfg_do_w = atoi(optarg); + if (cfg_do_w <= 0) + cfg_do_w = 50; break; case 'l': listen_mode = true; @@ -876,6 +937,12 @@ static void parse_opts(int argc, char **argv) case 'w': cfg_wait = atoi(optarg)*1000000; break; + case 'M': + cfg_mark = strtol(optarg, NULL, 0); + break; + case 'P': + cfg_peek = parse_peek(optarg); + break; } } @@ -907,6 +974,8 @@ int main(int argc, char *argv[]) set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); + if (cfg_mark) + set_mark(fd, cfg_mark); return main_loop_s(fd); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 10a030b53b23..9236609731b1 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -11,7 +11,8 @@ cin="" cout="" ksft_skip=4 capture=false -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) ipv6=true ethtool_random_on=true tc_delay="$((RANDOM%50))" @@ -273,7 +274,7 @@ check_mptcp_disabled() ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 local err=0 - LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ grep -q "^socket: Protocol not available$" && err=1 ip netns delete ${disabled_ns} @@ -374,7 +375,7 @@ do_transfer() local srv_proto="$4" local connect_addr="$5" local local_addr="$6" - local extra_args="" + local extra_args="$7" local port port=$((10000+$TEST_COUNT)) @@ -393,9 +394,9 @@ do_transfer() fi if [ -n "$extra_args" ] && $options_log; then - options_log=false echo "INFO: extra options: $extra_args" fi + options_log=false :> "$cout" :> "$sout" @@ -425,19 +426,32 @@ do_transfer() sleep 1 fi + 
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat -n + if [ ${listener_ns} != ${connector_ns} ]; then + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + fi + local stat_synrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") local stat_ackrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") local stat_cookietx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent") local stat_cookierx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesRecv") - ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + $extra_args $local_addr < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${listener_ns}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_args $connect_addr < "$cin" > "$cout" & local cpid=$! wait $cpid @@ -575,6 +589,7 @@ run_tests_lo() local connector_ns="$2" local connect_addr="$3" local loopback="$4" + local extra_args="$5" local lret=0 # skip if test programs are running inside same netns for subsequent runs. @@ -594,7 +609,8 @@ run_tests_lo() local_addr="0.0.0.0" fi - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? 
if [ $lret -ne 0 ]; then ret=$lret @@ -608,14 +624,16 @@ run_tests_lo() fi fi - do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} MPTCP TCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret return 1 fi - do_transfer ${listener_ns} ${connector_ns} TCP MPTCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} TCP MPTCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -623,7 +641,8 @@ run_tests_lo() fi if [ $do_tcp -gt 1 ] ;then - do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} TCP TCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -639,6 +658,15 @@ run_tests() run_tests_lo $1 $2 $3 0 } +run_tests_peekmode() +{ + local peekmode="$1" + + echo "INFO: with peek mode: ${peekmode}" + run_tests_lo "$ns1" "$ns1" 10.0.1.1 1 "-P ${peekmode}" + run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}" +} + make_file "$cin" "client" make_file "$sin" "server" @@ -718,6 +746,9 @@ for sender in $ns1 $ns2 $ns3 $ns4;do run_tests "$ns4" $sender dead:beef:3::1 done +run_tests_peekmode "saveWithPeek" +run_tests_peekmode "saveAfterPeek" + time_end=$(date +%s) time_run=$((time_end-time_start)) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 964db9ed544f..fd99485cf2a4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,9 +8,11 @@ cin="" cinsent="" cout="" ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" capture=0 +do_all_tests=1 TEST_COUNT=0 @@ -76,6 +78,7 @@ cleanup_partial() for netns in "$ns1" "$ns2"; do ip netns del $netns + rm -f /tmp/$netns.{nstat,out} done } @@ -121,12 +124,6 @@ 
reset_with_add_addr_timeout() -j DROP } -for arg in "$@"; do - if [ "$arg" = "-c" ]; then - capture=1 - fi -done - ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -237,10 +234,17 @@ do_transfer() sleep 1 fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat -n + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" - else - mptcp_connect="./mptcp_connect -r" + elif [ $speed = "slow" ]; then + mptcp_connect="./mptcp_connect -r 50" + elif [ $speed = "least" ]; then + mptcp_connect="./mptcp_connect -r 10" fi local local_addr @@ -250,17 +254,26 @@ do_transfer() local_addr="0.0.0.0" fi - ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port \ - -s ${srv_proto} ${local_addr} < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & spid=$! sleep 1 if [ "$test_link_fail" -eq 0 ];then - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & else - ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | tee "$cinsent" | \ - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr > "$cout" & + ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \ + tee "$cinsent" | \ + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr > "$cout" & fi cpid=$! 
@@ -284,17 +297,25 @@ do_transfer() let rm_nr_ns1=-addr_nr_ns1 if [ $rm_nr_ns1 -lt 8 ]; then counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns1 ] - do - ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`) + if [ ${#dump[@]} -gt 0 ]; then + id=${dump[1]} sleep 1 - let counter+=1 - done - else + + while [ $counter -le $rm_nr_ns1 ] + do + ip netns exec ${listener_ns} ./pm_nl_ctl del $id + sleep 1 + let counter+=1 + let id+=1 + done + fi + elif [ $rm_nr_ns1 -eq 8 ]; then sleep 1 ip netns exec ${listener_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns1 -eq 9 ]; then + sleep 1 + ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr} fi fi @@ -318,17 +339,31 @@ do_transfer() let rm_nr_ns2=-addr_nr_ns2 if [ $rm_nr_ns2 -lt 8 ]; then counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns2 ] - do - ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`) + if [ ${#dump[@]} -gt 0 ]; then + id=${dump[1]} sleep 1 - let counter+=1 - done - else + + while [ $counter -le $rm_nr_ns2 ] + do + ip netns exec ${connector_ns} ./pm_nl_ctl del $id + sleep 1 + let counter+=1 + let id+=1 + done + fi + elif [ $rm_nr_ns2 -eq 8 ]; then sleep 1 ip netns exec ${connector_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns2 -eq 9 ]; then + local addr + if is_v6 "${connect_addr}"; then + addr="dead:beef:1::2" + else + addr="10.0.1.2" + fi + sleep 1 + ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr fi fi @@ -354,12 +389,19 @@ do_transfer() kill $cappid fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat | grep Tcp > /tmp/${listener_ns}.out + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat | grep Tcp > /tmp/${connector_ns}.out + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then echo " client exit code $retc, server $rets" 1>&2 echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${listener_ns} ss 
-nita 1>&2 -o "sport = :$port" + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + cat /tmp/${listener_ns}.out echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + cat /tmp/${connector_ns}.out cat "$capout" ret=1 @@ -610,11 +652,22 @@ chk_rm_nr() { local rm_addr_nr=$1 local rm_subflow_nr=$2 + local invert=${3:-""} local count local dump_stats + local addr_ns + local subflow_ns + + if [ -z $invert ]; then + addr_ns=$ns1 + subflow_ns=$ns2 + elif [ $invert = "invert" ]; then + addr_ns=$ns2 + subflow_ns=$ns1 + fi printf "%-39s %s" " " "rm " - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'` + count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$rm_addr_nr" ]; then echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr" @@ -625,7 +678,7 @@ chk_rm_nr() fi echo -n " - sf " - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` + count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$rm_subflow_nr" ]; then echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr" @@ -724,6 +777,14 @@ subflows_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows, limited by server" 2 2 1 + + # single subflow, dev + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow dev ns2eth3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "single subflow, dev" 1 1 1 } signal_address_tests() @@ -767,6 +828,28 @@ signal_address_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows and signal" 3 3 3 chk_add_nr 1 1 + + # signal addresses + reset + ip 
netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal addresses" 3 3 3 + chk_add_nr 3 3 + + # signal invalid addresses + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal invalid addresses" 1 1 1 + chk_add_nr 3 3 } link_failure_tests() @@ -802,6 +885,26 @@ add_addr_timeout_tests() run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow chk_join_nr "signal address, ADD_ADDR6 timeout" 1 1 1 chk_add_nr 4 0 + + # signal addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "signal addresses, ADD_ADDR timeout" 2 2 2 + chk_add_nr 8 0 + + # signal invalid addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "invalid address, ADD_ADDR timeout" 1 1 1 + chk_add_nr 8 0 } remove_tests() @@ -833,7 +936,7 @@ remove_tests() run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow chk_join_nr "remove single address" 1 1 1 chk_add_nr 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal, remove reset @@ -858,6 +961,30 @@ remove_tests() chk_add_nr 1 1 chk_rm_nr 2 2 + 
# addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove addresses" 3 3 3 + chk_add_nr 3 3 + chk_rm_nr 3 3 invert + + # invalid addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert + # subflows and signal, flush reset ip netns exec $ns1 ./pm_nl_ctl limits 0 3 @@ -869,6 +996,60 @@ remove_tests() chk_join_nr "flush subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 + + # subflows flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow + chk_join_nr "flush subflows" 3 3 3 + chk_rm_nr 3 3 + + # addresses flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow + chk_join_nr "flush addresses" 3 3 3 + chk_add_nr 3 3 + chk_rm_nr 3 3 invert + + # invalid addresses flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 
10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -8 0 slow + chk_join_nr "flush invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert + + # remove id 0 subflow + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 0 -9 slow + chk_join_nr "remove id 0 subflow" 1 1 1 + chk_rm_nr 1 1 + + # remove id 0 address + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + run_tests $ns1 $ns2 10.0.1.1 0 -9 0 slow + chk_join_nr "remove id 0 address" 1 1 1 + chk_add_nr 1 1 + chk_rm_nr 1 1 invert } add_tests() @@ -945,7 +1126,7 @@ ipv6_tests() run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow chk_join_nr "remove single address IPv6" 1 1 1 chk_add_nr 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal IPv6, remove reset @@ -1088,7 +1269,7 @@ add_addr_ports_tests() run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow chk_join_nr "remove single address with port" 1 1 1 chk_add_nr 1 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal with port, remove reset @@ -1221,7 +1402,8 @@ usage() echo " -4 v4mapped_tests" echo " -b backup_tests" echo " -p add_addr_ports_tests" - echo " -c syncookies_tests" + echo " -k syncookies_tests" + echo " -c capture pcap files" echo " -h help" } @@ -1235,12 +1417,24 @@ make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT -if [ -z $1 ]; then +for arg in "$@"; do + # check for "capture" arg before launching tests + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then + capture=1 + fi + + # exception for the capture option, the rest means: a part of the tests + if [ "${arg}" != "-c" ]; then + do_all_tests=0 + fi 
+done + +if [ $do_all_tests -eq 1 ]; then all_tests exit $ret fi -while getopts 'fsltra64bpch' opt; do +while getopts 'fsltra64bpkch' opt; do case $opt in f) subflows_tests @@ -1272,9 +1466,11 @@ while getopts 'fsltra64bpch' opt; do p) add_addr_ports_tests ;; - c) + k) syncookies_tests ;; + c) + ;; h | *) usage ;; diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh new file mode 100755 index 000000000000..2fa13946ac04 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -0,0 +1,276 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) +mptcp_connect="" +do_all_tests=1 + +add_mark_rules() +{ + local ns=$1 + local m=$2 + + for t in iptables ip6tables; do + # just to debug: check we have multiple subflows connection requests + ip netns exec $ns $t -A OUTPUT -p tcp --syn -m mark --mark $m -j ACCEPT + + # RST packets might be handled by a internal dummy socket + ip netns exec $ns $t -A OUTPUT -p tcp --tcp-flags RST RST -m mark --mark 0 -j ACCEPT + + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark $m -j ACCEPT + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark 0 -j DROP + done +} + +init() +{ + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + for i in `seq 1 4`; do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev 
ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + + ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + + ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + done + + ip netns exec $ns1 ./pm_nl_ctl limits 8 8 + ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + + add_mark_rules $ns1 1 + add_mark_rules $ns2 2 +} + +cleanup() +{ + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +iptables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without iptables tool" + exit $ksft_skip +fi + +ip6tables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without ip6tables tool" + exit $ksft_skip +fi + +check_mark() +{ + local ns=$1 + local af=$2 + + tables=iptables + + if [ $af -eq 6 ];then + tables=ip6tables + fi + + counters=$(ip netns exec $ns $tables -v -L OUTPUT | grep DROP) + values=${counters%DROP*} + + for v in $values; do + if [ $v -ne 0 ]; then + echo "FAIL: got $tables $values in ns $ns , not 0 - not all expected packets marked" 1>&2 + return 1 + fi + done + + return 0 +} + +print_file_err() +{ + ls -l "$1" 1>&2 + echo "Trailing bytes are: " + tail -c 27 "$1" +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? 
-ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + ret=1 + + return 1 + fi + + return 0 +} + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=12001 + + :> "$cout" + :> "$sout" + + mptcp_connect="./mptcp_connect -r 20" + + local local_addr + if is_v6 "${connect_addr}"; then + local_addr="::" + else + local_addr="0.0.0.0" + fi + + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & + spid=$! + + sleep 1 + + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & + + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + + echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + + ret=1 + return 1 + fi + + if [ $local_addr = "::" ];then + check_mark $listener_ns 6 + check_mark $connector_ns 6 + else + check_mark $listener_ns 4 + check_mark $connector_ns 4 + fi + + check_transfer $cin $sout "file received by server" + + rets=$? 
+ + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + return 0 + fi + + return 1 +} + +make_file() +{ + name=$1 + who=$2 + size=$3 + + dd if=/dev/urandom of="$name" bs=1024 count=$size 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $size KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + + lret=$? + + if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +init +make_file "$cin" "client" 1 +make_file "$sin" "server" 1 +trap cleanup EXIT + +run_tests $ns1 $ns2 10.0.1.1 +run_tests $ns1 $ns2 dead:beef:1::1 + + +if [ $ret -eq 0 ];then + echo "PASS: all packets had packet mark set" +fi + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index a617e293734c..3c741abe034e 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -100,12 +100,12 @@ done check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" -for i in `seq 9 256`; do +ip netns exec $ns1 ./pm_nl_ctl del 9 +for i in `seq 10 255`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i ip netns exec $ns1 ./pm_nl_ctl del $i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $((i+1)) done check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.0.9 id 3 flags signal,backup 10.0.1.3 id 4 flags signal 10.0.1.4 id 5 flags signal 10.0.1.5 diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7b4167f3f9a2..115decfdc1ef 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -26,7 +26,7 @@ 
static void syntax(char *argv[]) { fprintf(stderr, "%s add|get|set|del|flush|dump|accept [<args>]\n", argv[0]); fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); - fprintf(stderr, "\tdel <id>\n"); + fprintf(stderr, "\tdel <id> [<ip>]\n"); fprintf(stderr, "\tget <id>\n"); fprintf(stderr, "\tset <ip> [flags backup|nobackup]\n"); fprintf(stderr, "\tflush\n"); @@ -301,6 +301,7 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) 1024]; struct rtattr *rta, *nest; struct nlmsghdr *nh; + u_int16_t family; int nest_start; u_int8_t id; int off = 0; @@ -310,11 +311,14 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, MPTCP_PM_VER); - /* the only argument is the address id */ - if (argc != 3) + /* the only argument is the address id (nonzero) */ + if (argc != 3 && argc != 4) syntax(argv); id = atoi(argv[2]); + /* zero id with the IP address */ + if (!id && argc != 4) + syntax(argv); nest_start = off; nest = (void *)(data + off); @@ -328,6 +332,30 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) rta->rta_len = RTA_LENGTH(1); memcpy(RTA_DATA(rta), &id, 1); off += NLMSG_ALIGN(rta->rta_len); + + if (!id) { + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[3], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[3], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else { + error(1, errno, "can't parse ip %s", argv[3]); + } + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + } nest->rta_len = off - nest_start; do_nl_req(fd, nh, off, 0); diff --git 
a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f039ee57eb3c..3aeef3bcb101 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -7,7 +7,8 @@ ns2="ns2-$rndh" ns3="ns3-$rndh" capture=false ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) test_cnt=1 ret=0 bail=0 @@ -157,14 +158,20 @@ do_transfer() sleep 1 fi - ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${ns3} \ + ./mptcp_connect -jt ${timeout_poll} -l -p $port \ + 0.0.0.0 < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${ns3}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${ns1} \ + ./mptcp_connect -jt ${timeout_poll} -p $port \ + 10.0.3.3 < "$cin" > "$cout" & local cpid=$! 
wait $cpid diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 7b01b7c2ec10..066efd30e294 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -30,25 +30,25 @@ struct reuse_opts { }; struct reuse_opts unreusable_opts[12] = { - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, + {{0, 0}, {0, 0}}, + {{0, 0}, {0, 1}}, + {{0, 0}, {1, 0}}, + {{0, 0}, {1, 1}}, + {{0, 1}, {0, 0}}, + {{0, 1}, {0, 1}}, + {{0, 1}, {1, 0}}, + {{0, 1}, {1, 1}}, + {{1, 0}, {0, 0}}, + {{1, 0}, {0, 1}}, + {{1, 0}, {1, 0}}, + {{1, 0}, {1, 1}}, }; struct reuse_opts reusable_opts[4] = { - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + {{1, 1}, {0, 0}}, + {{1, 1}, {0, 1}}, + {{1, 1}, {1, 0}}, + {{1, 1}, {1, 1}}, }; int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings new file mode 100644 index 000000000000..694d70710ff0 --- /dev/null +++ b/tools/testing/selftests/net/settings @@ -0,0 +1 @@ +timeout=300 diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index b4cca382d125..59067f64b775 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/net/so_txtime.c @@ -2,9 +2,12 @@ /* * Test the SO_TXTIME API * - * Takes two streams of { payload, delivery time }[], one input and one output. - * Sends the input stream and verifies arrival matches the output stream. - * The two streams can differ due to out-of-order delivery and drops. + * Takes a stream of { payload, delivery time }[], to be sent across two + * processes. 
Start this program on two separate network namespaces or + * connected hosts, one instance in transmit mode and the other in receive + * mode using the '-r' option. Receiver will compare arrival timestamps to + * the expected stream. Sender will read transmit timestamps from the error + * queue. The streams can differ due to out-of-order delivery and drops. */ #define _GNU_SOURCE @@ -28,14 +31,17 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> +#include <poll.h> static int cfg_clockid = CLOCK_TAI; -static bool cfg_do_ipv4; -static bool cfg_do_ipv6; static uint16_t cfg_port = 8000; static int cfg_variance_us = 4000; +static uint64_t cfg_start_time_ns; +static int cfg_mark; +static bool cfg_rx; static uint64_t glob_tstart; +static uint64_t tdeliver_max; /* encode one timed transmission (of a 1B payload) */ struct timed_send { @@ -44,18 +50,21 @@ struct timed_send { }; #define MAX_NUM_PKT 8 -static struct timed_send cfg_in[MAX_NUM_PKT]; -static struct timed_send cfg_out[MAX_NUM_PKT]; +static struct timed_send cfg_buf[MAX_NUM_PKT]; static int cfg_num_pkt; static int cfg_errq_level; static int cfg_errq_type; -static uint64_t gettime_ns(void) +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; +static socklen_t cfg_alen; + +static uint64_t gettime_ns(clockid_t clock) { struct timespec ts; - if (clock_gettime(cfg_clockid, &ts)) + if (clock_gettime(clock, &ts)) error(1, errno, "gettime"); return ts.tv_sec * (1000ULL * 1000 * 1000) + ts.tv_nsec; @@ -75,6 +84,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_iov = &iov; msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&cfg_dst_addr; + msg.msg_namelen = cfg_alen; if (ts->delay_us >= 0) { memset(control, 0, sizeof(control)); @@ -82,6 +93,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_controllen = sizeof(control); tdeliver = glob_tstart + ts->delay_us * 1000; + tdeliver_max = tdeliver_max > tdeliver ? 
+ tdeliver_max : tdeliver; cm = CMSG_FIRSTHDR(&msg); cm->cmsg_level = SOL_SOCKET; @@ -98,7 +111,7 @@ static void do_send_one(int fdt, struct timed_send *ts) } -static bool do_recv_one(int fdr, struct timed_send *ts) +static void do_recv_one(int fdr, struct timed_send *ts) { int64_t tstop, texpect; char rbuf[2]; @@ -106,13 +119,13 @@ static bool do_recv_one(int fdr, struct timed_send *ts) ret = recv(fdr, rbuf, sizeof(rbuf), 0); if (ret == -1 && errno == EAGAIN) - return true; + error(1, EAGAIN, "recv: timeout"); if (ret == -1) error(1, errno, "read"); if (ret != 1) error(1, 0, "read: %dB", ret); - tstop = (gettime_ns() - glob_tstart) / 1000; + tstop = (gettime_ns(cfg_clockid) - glob_tstart) / 1000; texpect = ts->delay_us >= 0 ? ts->delay_us : 0; fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n", @@ -123,8 +136,6 @@ static bool do_recv_one(int fdr, struct timed_send *ts) if (llabs(tstop - texpect) > cfg_variance_us) error(1, 0, "exceeds variance (%d us)", cfg_variance_us); - - return false; } static void do_recv_verify_empty(int fdr) @@ -137,18 +148,18 @@ static void do_recv_verify_empty(int fdr) error(1, 0, "recv: not empty as expected (%d, %d)", ret, errno); } -static void do_recv_errqueue_timeout(int fdt) +static int do_recv_errqueue_timeout(int fdt) { char control[CMSG_SPACE(sizeof(struct sock_extended_err)) + CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0}; char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 1]; struct sock_extended_err *err; + int ret, num_tstamp = 0; struct msghdr msg = {0}; struct iovec iov = {0}; struct cmsghdr *cm; int64_t tstamp = 0; - int ret; iov.iov_base = data; iov.iov_len = sizeof(data); @@ -206,9 +217,47 @@ static void do_recv_errqueue_timeout(int fdt) msg.msg_flags = 0; msg.msg_controllen = sizeof(control); + num_tstamp++; } - error(1, 0, "recv: timeout"); + return num_tstamp; +} + +static void recv_errqueue_msgs(int fdt) +{ + struct pollfd pfd = { .fd = fdt, .events = POLLERR }; + const int 
timeout_ms = 10; + int ret, num_tstamp = 0; + + do { + ret = poll(&pfd, 1, timeout_ms); + if (ret == -1) + error(1, errno, "poll"); + + if (ret && (pfd.revents & POLLERR)) + num_tstamp += do_recv_errqueue_timeout(fdt); + + if (num_tstamp == cfg_num_pkt) + break; + + } while (gettime_ns(cfg_clockid) < tdeliver_max); +} + +static void start_time_wait(void) +{ + uint64_t now; + int err; + + if (!cfg_start_time_ns) + return; + + now = gettime_ns(CLOCK_REALTIME); + if (cfg_start_time_ns < now) + return; + + err = usleep((cfg_start_time_ns - now) / 1000); + if (err) + error(1, errno, "usleep"); } static void setsockopt_txtime(int fd) @@ -245,6 +294,10 @@ static int setup_tx(struct sockaddr *addr, socklen_t alen) setsockopt_txtime(fd); + if (cfg_mark && + setsockopt(fd, SOL_SOCKET, SO_MARK, &cfg_mark, sizeof(cfg_mark))) + error(1, errno, "setsockopt mark"); + return fd; } @@ -266,31 +319,70 @@ static int setup_rx(struct sockaddr *addr, socklen_t alen) return fd; } -static void do_test(struct sockaddr *addr, socklen_t alen) +static void do_test_tx(struct sockaddr *addr, socklen_t alen) { - int fdt, fdr, i; + int fdt, i; fprintf(stderr, "\nSO_TXTIME ipv%c clock %s\n", addr->sa_family == PF_INET ? '4' : '6', cfg_clockid == CLOCK_TAI ? 
"tai" : "monotonic"); fdt = setup_tx(addr, alen); - fdr = setup_rx(addr, alen); - glob_tstart = gettime_ns(); + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); for (i = 0; i < cfg_num_pkt; i++) - do_send_one(fdt, &cfg_in[i]); + do_send_one(fdt, &cfg_buf[i]); + + recv_errqueue_msgs(fdt); + + if (close(fdt)) + error(1, errno, "close t"); +} + +static void do_test_rx(struct sockaddr *addr, socklen_t alen) +{ + int fdr, i; + + fdr = setup_rx(addr, alen); + + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); + for (i = 0; i < cfg_num_pkt; i++) - if (do_recv_one(fdr, &cfg_out[i])) - do_recv_errqueue_timeout(fdt); + do_recv_one(fdr, &cfg_buf[i]); do_recv_verify_empty(fdr); if (close(fdr)) error(1, errno, "close r"); - if (close(fdt)) - error(1, errno, "close t"); +} + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + } } static int parse_io(const char *optarg, struct timed_send *array) @@ -323,17 +415,46 @@ static int parse_io(const char *optarg, struct timed_send *array) return aoff / 2; } +static void usage(const char *progname) +{ + fprintf(stderr, "\nUsage: %s [options] <payload>\n" + "Options:\n" + " -4 only IPv4\n" + " -6 only IPv6\n" + " -c <clock> monotonic (default) or tai\n" + " -D <addr> destination IP address (server)\n" + " -S <addr> source IP address (client)\n" 
+ " -r run rx mode\n" + " -t <nsec> start time (UTC nanoseconds)\n" + " -m <mark> socket mark\n" + "\n", + progname); + exit(1); +} + static void parse_opts(int argc, char **argv) { - int c, ilen, olen; + char *daddr = NULL, *saddr = NULL; + int domain = PF_UNSPEC; + int c; - while ((c = getopt(argc, argv, "46c:")) != -1) { + while ((c = getopt(argc, argv, "46c:S:D:rt:m:")) != -1) { switch (c) { case '4': - cfg_do_ipv4 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + cfg_errq_level = SOL_IP; + cfg_errq_type = IP_RECVERR; break; case '6': - cfg_do_ipv6 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + cfg_errq_level = SOL_IPV6; + cfg_errq_type = IPV6_RECVERR; break; case 'c': if (!strcmp(optarg, "tai")) @@ -344,50 +465,50 @@ static void parse_opts(int argc, char **argv) else error(1, 0, "unknown clock id %s", optarg); break; + case 'S': + saddr = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'r': + cfg_rx = true; + break; + case 't': + cfg_start_time_ns = strtol(optarg, NULL, 0); + break; + case 'm': + cfg_mark = strtol(optarg, NULL, 0); + break; default: - error(1, 0, "parse error at %d", optind); + usage(argv[0]); } } - if (argc - optind != 2) - error(1, 0, "Usage: %s [-46] -c <clock> <in> <out>", argv[0]); + if (argc - optind != 1) + usage(argv[0]); + + if (domain == PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + if (!daddr) + error(1, 0, "-D <server addr> required\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S <client addr> required\n"); - ilen = parse_io(argv[optind], cfg_in); - olen = parse_io(argv[optind + 1], cfg_out); - if (ilen != olen) - error(1, 0, "i/o streams len mismatch (%d, %d)\n", ilen, olen); - cfg_num_pkt = ilen; + setup_sockaddr(domain, daddr, &cfg_dst_addr); + setup_sockaddr(domain, saddr, &cfg_src_addr); + + cfg_num_pkt = parse_io(argv[optind], 
cfg_buf); } int main(int argc, char **argv) { parse_opts(argc, argv); - if (cfg_do_ipv6) { - struct sockaddr_in6 addr6 = {0}; - - addr6.sin6_family = AF_INET6; - addr6.sin6_port = htons(cfg_port); - addr6.sin6_addr = in6addr_loopback; - - cfg_errq_level = SOL_IPV6; - cfg_errq_type = IPV6_RECVERR; - - do_test((void *)&addr6, sizeof(addr6)); - } - - if (cfg_do_ipv4) { - struct sockaddr_in addr4 = {0}; - - addr4.sin_family = AF_INET; - addr4.sin_port = htons(cfg_port); - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - - cfg_errq_level = SOL_IP; - cfg_errq_type = IP_RECVERR; - - do_test((void *)&addr4, sizeof(addr4)); - } + if (cfg_rx) + do_test_rx((void *)&cfg_dst_addr, cfg_alen); + else + do_test_tx((void *)&cfg_src_addr, cfg_alen); return 0; } diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh index 3f7800eaecb1..3f06f4d286a9 100755 --- a/tools/testing/selftests/net/so_txtime.sh +++ b/tools/testing/selftests/net/so_txtime.sh @@ -3,32 +3,85 @@ # # Regression tests for the SO_TXTIME interface -# Run in network namespace -if [[ $# -eq 0 ]]; then - if ! 
./in_netns.sh $0 __subprocess; then - # test is time sensitive, can be flaky - echo "test failed: retry once" - ./in_netns.sh $0 __subprocess +set -e + +readonly DEV="veth0" +readonly BIN="./so_txtime" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" +} + +trap cleanup EXIT + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" netns "${NS1}" type veth \ + peer name "${DEV}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +do_test() { + local readonly IP="$1" + local readonly CLOCK="$2" + local readonly TXARGS="$3" + local readonly RXARGS="$4" + + if [[ "${IP}" == "4" ]]; then + local readonly SADDR="${SADDR4}" + local readonly DADDR="${DADDR4}" + elif [[ "${IP}" == "6" ]]; then + local readonly SADDR="${SADDR6}" + local readonly DADDR="${DADDR6}" + else + echo "Invalid IP version ${IP}" + exit 1 fi - exit $? -fi + local readonly START="$(date +%s%N --date="+ 0.1 seconds")" + ip netns exec "${NS2}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${RXARGS}" -r & + ip netns exec "${NS1}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${TXARGS}" + wait "$!" 
+} -set -e +ip netns exec "${NS1}" tc qdisc add dev "${DEV}" root fq +do_test 4 mono a,-1 a,-1 +do_test 6 mono a,0 a,0 +do_test 6 mono a,10 a,10 +do_test 4 mono a,10,b,20 a,10,b,20 +do_test 6 mono a,20,b,10 b,20,a,20 -tc qdisc add dev lo root fq -./so_txtime -4 -6 -c mono a,-1 a,-1 -./so_txtime -4 -6 -c mono a,0 a,0 -./so_txtime -4 -6 -c mono a,10 a,10 -./so_txtime -4 -6 -c mono a,10,b,20 a,10,b,20 -./so_txtime -4 -6 -c mono a,20,b,10 b,20,a,20 - -if tc qdisc replace dev lo root etf clockid CLOCK_TAI delta 400000; then - ! ./so_txtime -4 -6 -c tai a,-1 a,-1 - ! ./so_txtime -4 -6 -c tai a,0 a,0 - ./so_txtime -4 -6 -c tai a,10 a,10 - ./so_txtime -4 -6 -c tai a,10,b,20 a,10,b,20 - ./so_txtime -4 -6 -c tai a,20,b,10 b,10,a,20 +if ip netns exec "${NS1}" tc qdisc replace dev "${DEV}" root etf clockid CLOCK_TAI delta 400000; then + ! do_test 4 tai a,-1 a,-1 + ! do_test 6 tai a,0 a,0 + do_test 6 tai a,10 a,10 + do_test 4 tai a,10,b,20 a,10,b,20 + do_test 6 tai a,20,b,10 b,10,a,20 else echo "tc ($(tc -V)) does not support qdisc etf. skipping" fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh new file mode 100755 index 000000000000..a8fa64136282 --- /dev/null +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -0,0 +1,251 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly BASE="ns-$(mktemp -u XXXXXX)" +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +# "overlay" network used for UDP over UDP tunnel traffic +readonly OL_NET_V4=172.16.1. 
+readonly OL_NET_V6=2001:db8:1:: +readonly NPROCS=`nproc` + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local net + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns + ip -n $BASE$ns link set dev veth$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +} + +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_dev=$2 + local -r bm_rem_addr=$3 + local -r vxlan_dev=$4 + local -r vxlan_id=$5 + local -r vxlan_port=4789 + + ip -n $netns link set dev $bm_dev up + ip -n $netns link add dev $vxlan_dev type vxlan id $vxlan_id \ + dstport $vxlan_port remote $bm_rem_addr + ip -n $netns link set dev $vxlan_dev up +} + +create_vxlan_pair() { + local ns + + create_ns + + for ns in $SRC $DST; do + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$((3 - $ns)) vxlan$ns 4 + ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24 + done + for ns in $SRC $DST; do + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6 + ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad + done +} + +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +run_test() { + local -r msg=$1 + local -r dst=$2 + local -r pkts=$3 + local -r vxpkts=$4 + local bind=$5 + local rx_args="" + local rx_family="-4" + local family=-4 + local filter=IpInReceives + local ipt=iptables + + printf "%-40s" "$msg" + + if is_ipv6 $dst; then + # rx program does not support '-6' and implies ipv6 usage by default 
+ rx_family="" + family=-6 + filter=Ip6InReceives + ipt=ip6tables + fi + + rx_args="$rx_family" + [ -n "$bind" ] && rx_args="$rx_args -b $bind" + + # send a single GSO packet, segmented in 10 UDP frames. + # Always expect 10 UDP frames on RX side as rx socket does + # not enable GRO + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789 + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000 + ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \ + sed -e 's/\[//' -e 's/:.*//'` + if [ $rcv != $pkts ]; then + echo " fail - received $rvs packets, expected $pkts" + ret=1 + return + fi + + local vxrcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \ + sed -e 's/\[//' -e 's/:.*//'` + + # upper net can generate a little noise, allow some tolerance + if [ $vxrcv -lt $vxpkts -o $vxrcv -gt $((vxpkts + 3)) ]; then + echo " fail - received $vxrcv vxlan packets, expected $vxpkts" + ret=1 + return + fi + echo " ok" +} + +run_bench() { + local -r msg=$1 + local -r dst=$2 + local family=-4 + + printf "%-40s" "$msg" + if [ $NPROCS -lt 2 ]; then + echo " skip - needed 2 CPUs found $NPROCS" + return + fi + + is_ipv6 $dst && family=-6 + + # bind the sender and the receiver to different CPUs to try + # get reproducible results + ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus" + ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? 
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi +} + +for family in 4 6; do + BM_NET=$BM_NET_V4 + OL_NET=$OL_NET_V4 + IPT=iptables + SUFFIX=24 + VXDEV=vxlan + + if [ $family = 6 ]; then + BM_NET=$BM_NET_V6 + OL_NET=$OL_NET_V6 + SUFFIX="64 nodad" + VXDEV=vxlan6 + IPT=ip6tables + fi + + echo "IPv$family" + + create_ns + run_test "No GRO" $BM_NET$DST 10 0 + cleanup + + create_ns + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list" $BM_NET$DST 1 0 + cleanup + + # UDP GRO fwd skips aggregation when find an udp socket with the GRO option + # if there is an UDP tunnel in the running system, such lookup happen + # take place. + # use NAT to circumvent GRO FWD check + create_ns + ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \ + -j DNAT --to-destination $BM_NET$DST + run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST + cleanup + + create_ns + run_bench "UDP fwd perf" $BM_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP GRO fwd perf" $BM_NET$DST + cleanup + + create_vxlan_pair + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + cleanup + + # use NAT to circumvent GRO FWD check + create_vxlan_pair + ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \ + -j DNAT --to-destination $OL_NET$DST + + # load arp cache before running the test to reduce the amount of + # stray traffic on top of the UDP tunnel + ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST + cleanup + + create_vxlan_pair + 
run_bench "UDP tunnel fwd perf" $OL_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + cleanup +done + +exit $ret diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh new file mode 100755 index 000000000000..2fedc0781ce8 --- /dev/null +++ b/tools/testing/selftests/net/veth.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" +readonly BASE=`basename $STATS` +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +readonly NPROCS=`nproc` +ret=0 + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + rm -f $STATS + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + echo "#kernel" > $BASE + chmod go-rw $BASE +} + +__chk_flag() { + local msg="$1" + local target=$2 + local expected=$3 + local flagname=$4 + + local flag=`ip netns exec $BASE$target ethtool -k veth$target |\ + grep $flagname | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$flag" = "$expected" ]; then + echo " ok " + else + echo " fail - expected $expected found $flag" + ret=1 + fi +} + +chk_gro_flag() { + __chk_flag "$1" $2 $3 generic-receive-offload +} + +chk_tso_flag() { + __chk_flag "$1" $2 $3 tcp-segmentation-offload +} + +chk_gro() { + local msg="$1" + local expected=$2 + + ip netns exec $BASE$SRC 
ping -qc 1 $BM_NET_V4$DST >/dev/null + NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n + + printf "%-60s" "$msg" + ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \ + awk '{print $2}' | tail -n 1` + if [ "$pkts" = "$expected" ]; then + echo " ok " + else + echo " fail - got $pkts packets, expected $expected " + ret=1 + fi +} + +if [ ! -f ../bpf/xdp_dummy.o ]; then + echo "Missing xdp_dummy helper. Build bpf selftest first" + exit -1 +fi + +create_ns +chk_gro_flag "default - gro flag" $SRC off +chk_gro_flag " - peer gro flag" $DST off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +chk_gro " - aggregation" 1 +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro " - aggregation with TSO off" 10 +cleanup + +create_ns +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro on - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST down +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro enabled on link down - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST 
rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +chk_gro_flag "with xdp attached - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC off +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation" 1 + + +ip -n $NS_DST link set dev veth$DST down +ip -n $NS_SRC link set dev veth$SRC down +chk_gro_flag " - after dev off, flag" $DST on +chk_gro_flag " - peer flag" $SRC off + +ip netns exec $NS_DST ethtool -K veth$DST gro on +ip -n $NS_DST link set dev veth$DST xdp off +chk_gro_flag " - after gro on xdp off, gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip -n $NS_SRC link set dev veth$SRC up +chk_gro " - aggregation" 1 + +ip netns exec $NS_DST ethtool -K veth$DST gro off +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro "aggregation again with default and TSO off" 10 + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 431296c0f91c..427d94816f2d 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -371,6 +371,88 @@ else ip netns exec nsr1 nft list ruleset fi +# Another test: +# Add bridge interface br0 to Router1, with NAT enabled. +ip -net nsr1 link add name br0 type bridge +ip -net nsr1 addr flush dev veth0 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set veth0 master br0 +ip -net nsr1 addr add 10.0.1.1/24 dev br0 +ip -net nsr1 addr add dead:1::1/64 dev br0 +ip -net nsr1 link set up dev br0 + +ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null + +# br0 with NAT enabled. 
+ip netns exec nsr1 nft -f - <<EOF +flush table ip nat +table ip nat { + chain prerouting { + type nat hook prerouting priority 0; policy accept; + meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345 + } + + chain postrouting { + type nat hook postrouting priority 0; policy accept; + meta oifname "veth1" counter masquerade + } +} +EOF + +if test_tcp_forwarding_nat ns1 ns2; then + echo "PASS: flow offloaded for ns1/ns2 with bridge NAT" +else + echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# Another test: +# Add bridge interface br0 to Router1, with NAT and VLAN. +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set down dev veth0 +ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set up dev veth0.10 +ip -net nsr1 link set veth0.10 master br0 + +ip -net ns1 addr flush dev eth0 +ip -net ns1 link add link eth0 name eth0.10 type vlan id 10 +ip -net ns1 link set eth0 up +ip -net ns1 link set eth0.10 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0.10 +ip -net ns1 route add default via 10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0.10 + +if test_tcp_forwarding_nat ns1 ns2; then + echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN" +else + echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# restore test topology (remove bridge and VLAN) +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set veth0 down +ip -net nsr1 link set veth0.10 down +ip -net nsr1 link delete veth0.10 type vlan +ip -net nsr1 link delete br0 type bridge +ip -net ns1 addr flush dev eth0.10 +ip -net ns1 link set eth0.10 down +ip -net ns1 link set eth0 down +ip -net ns1 link delete eth0.10 type vlan + +# restore address in ns1 and nsr1 +ip -net ns1 link set eth0 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0 +ip -net ns1 route add default via 
10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0 +ip -net ns1 route add default via dead:1::1 +ip -net nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net nsr1 addr add dead:1::1/64 dev veth0 +ip -net nsr1 link set up dev veth0 + KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) SPI1=$RANDOM diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore new file mode 100644 index 000000000000..790c47001e77 --- /dev/null +++ b/tools/testing/selftests/perf_events/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +sigtrap_threads +remove_on_exec diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile new file mode 100644 index 000000000000..fcafa5f0d34c --- /dev/null +++ b/tools/testing/selftests/perf_events/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include +LDFLAGS += -lpthread + +TEST_GEN_PROGS := sigtrap_threads remove_on_exec +include ../lib.mk diff --git a/tools/testing/selftests/perf_events/config b/tools/testing/selftests/perf_events/config new file mode 100644 index 000000000000..ba58ff2203e4 --- /dev/null +++ b/tools/testing/selftests/perf_events/config @@ -0,0 +1 @@ +CONFIG_PERF_EVENTS=y diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c b/tools/testing/selftests/perf_events/remove_on_exec.c new file mode 100644 index 000000000000..5814611a1dc7 --- /dev/null +++ b/tools/testing/selftests/perf_events/remove_on_exec.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for remove_on_exec. + * + * Copyright (C) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +/* We need the latest siginfo from the kernel repo. 
*/ +#include <sys/types.h> +#include <asm/siginfo.h> +#define __have_siginfo_t 1 +#define __have_sigval_t 1 +#define __have_sigevent_t 1 +#define __siginfo_t_defined +#define __sigval_t_defined +#define __sigevent_t_defined +#define _BITS_SIGINFO_CONSTS_H 1 +#define _BITS_SIGEVENT_CONSTS_H 1 + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <linux/perf_event.h> +#include <pthread.h> +#include <signal.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +static volatile int signal_count; + +static struct perf_event_attr make_event_attr(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .size = sizeof(attr), + .config = PERF_COUNT_HW_INSTRUCTIONS, + .sample_period = 1000, + .exclude_kernel = 1, + .exclude_hv = 1, + .disabled = 1, + .inherit = 1, + /* + * Children normally retain their inherited event on exec; with + * remove_on_exec, we'll remove their event, but the parent and + * any other non-exec'd children will keep their events. + */ + .remove_on_exec = 1, + .sigtrap = 1, + }; + return attr; +} + +static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext) +{ + if (info->si_code != TRAP_PERF) { + fprintf(stderr, "%s: unexpected si_code %d\n", __func__, info->si_code); + return; + } + + signal_count++; +} + +FIXTURE(remove_on_exec) +{ + struct sigaction oldact; + int fd; +}; + +FIXTURE_SETUP(remove_on_exec) +{ + struct perf_event_attr attr = make_event_attr(); + struct sigaction action = {}; + + signal_count = 0; + + /* Initialize sigtrap handler. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0); + + /* Initialize perf event. 
*/ + self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + ASSERT_NE(self->fd, -1); +} + +FIXTURE_TEARDOWN(remove_on_exec) +{ + close(self->fd); + sigaction(SIGTRAP, &self->oldact, NULL); +} + +/* Verify event propagates to fork'd child. */ +TEST_F(remove_on_exec, fork_only) +{ + int status; + pid_t pid = fork(); + + if (pid == 0) { + ASSERT_EQ(signal_count, 0); + ASSERT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + while (!signal_count); + _exit(42); + } + + while (!signal_count); /* Child enables event. */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(WEXITSTATUS(status), 42); +} + +/* + * Verify that event does _not_ propagate to fork+exec'd child; event enabled + * after fork+exec. + */ +TEST_F(remove_on_exec, fork_exec_then_enable) +{ + pid_t pid_exec, pid_only_fork; + int pipefd[2]; + int tmp; + + /* + * Non-exec child, to ensure exec does not affect inherited events of + * other children. + */ + pid_only_fork = fork(); + if (pid_only_fork == 0) { + /* Block until parent enables event. */ + while (!signal_count); + _exit(42); + } + + ASSERT_NE(pipe(pipefd), -1); + pid_exec = fork(); + if (pid_exec == 0) { + ASSERT_NE(dup2(pipefd[1], STDOUT_FILENO), -1); + close(pipefd[0]); + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + close(pipefd[1]); + + ASSERT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Child is running. */ + /* Wait for exec'd child to start spinning. */ + EXPECT_EQ(read(pipefd[0], &tmp, sizeof(int)), sizeof(int)); + EXPECT_EQ(tmp, 42); + close(pipefd[0]); + /* Now we can enable the event, knowing the child is doing work. */ + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + /* If the event propagated to the exec'd child, it will exit normally... */ + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + EXPECT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Should still be running. 
*/ + EXPECT_EQ(kill(pid_exec, SIGKILL), 0); + + /* Verify removal from child did not affect this task's event. */ + tmp = signal_count; + while (signal_count == tmp); /* Should not hang! */ + /* Nor should it have affected the first child. */ + EXPECT_EQ(waitpid(pid_only_fork, &tmp, 0), pid_only_fork); + EXPECT_EQ(WEXITSTATUS(tmp), 42); +} + +/* + * Verify that event does _not_ propagate to fork+exec'd child; event enabled + * before fork+exec. + */ +TEST_F(remove_on_exec, enable_then_fork_exec) +{ + pid_t pid_exec; + int tmp; + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + + pid_exec = fork(); + if (pid_exec == 0) { + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + + /* + * The child may exit abnormally at any time if the event propagated and + * a SIGTRAP is sent before the handler was set up. + */ + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + EXPECT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Should still be running. */ + EXPECT_EQ(kill(pid_exec, SIGKILL), 0); + + /* Verify removal from child did not affect this task's event. */ + tmp = signal_count; + while (signal_count == tmp); /* Should not hang! */ +} + +TEST_F(remove_on_exec, exec_stress) +{ + pid_t pids[30]; + int i, tmp; + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { + pids[i] = fork(); + if (pids[i] == 0) { + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + + /* Some forked with event disabled, rest with enabled. */ + if (i > 10) + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + } + + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { + /* All children should still be running. */ + EXPECT_EQ(waitpid(pids[i], &tmp, WNOHANG), 0); + EXPECT_EQ(kill(pids[i], SIGKILL), 0); + } + + /* Verify event is still alive. 
*/ + tmp = signal_count; + while (signal_count == tmp); +} + +/* For exec'd child. */ +static void exec_child(void) +{ + struct sigaction action = {}; + const int val = 42; + + /* Set up sigtrap handler in case we erroneously receive a trap. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + if (sigaction(SIGTRAP, &action, NULL)) + _exit((perror("sigaction failed"), 1)); + + /* Signal parent that we're starting to spin. */ + if (write(STDOUT_FILENO, &val, sizeof(int)) == -1) + _exit((perror("write failed"), 1)); + + /* Should hang here until killed. */ + while (!signal_count); +} + +#define main test_main +TEST_HARNESS_MAIN +#undef main +int main(int argc, char *argv[]) +{ + if (!strcmp(argv[0], "exec_child")) { + exec_child(); + return 1; + } + + return test_main(argc, argv); +} diff --git a/tools/testing/selftests/perf_events/settings b/tools/testing/selftests/perf_events/settings new file mode 100644 index 000000000000..6091b45d226b --- /dev/null +++ b/tools/testing/selftests/perf_events/settings @@ -0,0 +1 @@ +timeout=120 diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c new file mode 100644 index 000000000000..78ddf5e11625 --- /dev/null +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for perf events with SIGTRAP across all threads. + * + * Copyright (C) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +/* We need the latest siginfo from the kernel repo. 
*/ +#include <sys/types.h> +#include <asm/siginfo.h> +#define __have_siginfo_t 1 +#define __have_sigval_t 1 +#define __have_sigevent_t 1 +#define __siginfo_t_defined +#define __sigval_t_defined +#define __sigevent_t_defined +#define _BITS_SIGINFO_CONSTS_H 1 +#define _BITS_SIGEVENT_CONSTS_H 1 + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <linux/hw_breakpoint.h> +#include <linux/perf_event.h> +#include <pthread.h> +#include <signal.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +#define NUM_THREADS 5 + +/* Data shared between test body, threads, and signal handler. */ +static struct { + int tids_want_signal; /* Which threads still want a signal. */ + int signal_count; /* Sanity check number of signals received. */ + volatile int iterate_on; /* Variable to set breakpoint on. */ + siginfo_t first_siginfo; /* First observed siginfo_t. */ +} ctx; + +/* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */ +#define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) + +static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_BREAKPOINT, + .size = sizeof(attr), + .sample_period = 1, + .disabled = !enabled, + .bp_addr = (unsigned long)addr, + .bp_type = HW_BREAKPOINT_RW, + .bp_len = HW_BREAKPOINT_LEN_1, + .inherit = 1, /* Children inherit events ... */ + .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. */ + .remove_on_exec = 1, /* Required by sigtrap. */ + .sigtrap = 1, /* Request synchronous SIGTRAP on event. 
*/ + .sig_data = TEST_SIG_DATA(addr), + }; + return attr; +} + +static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext) +{ + if (info->si_code != TRAP_PERF) { + fprintf(stderr, "%s: unexpected si_code %d\n", __func__, info->si_code); + return; + } + + /* + * The data in siginfo_t we're interested in should all be the same + * across threads. + */ + if (!__atomic_fetch_add(&ctx.signal_count, 1, __ATOMIC_RELAXED)) + ctx.first_siginfo = *info; + __atomic_fetch_sub(&ctx.tids_want_signal, syscall(__NR_gettid), __ATOMIC_RELAXED); +} + +static void *test_thread(void *arg) +{ + pthread_barrier_t *barrier = (pthread_barrier_t *)arg; + pid_t tid = syscall(__NR_gettid); + int iter; + int i; + + pthread_barrier_wait(barrier); + + __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); + iter = ctx.iterate_on; /* read */ + for (i = 0; i < iter - 1; i++) { + __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); + ctx.iterate_on = iter; /* idempotent write */ + } + + return NULL; +} + +FIXTURE(sigtrap_threads) +{ + struct sigaction oldact; + pthread_t threads[NUM_THREADS]; + pthread_barrier_t barrier; + int fd; +}; + +FIXTURE_SETUP(sigtrap_threads) +{ + struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on); + struct sigaction action = {}; + int i; + + memset(&ctx, 0, sizeof(ctx)); + + /* Initialize sigtrap handler. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0); + + /* Initialize perf event. */ + self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + ASSERT_NE(self->fd, -1); + + /* Spawn threads inheriting perf event. 
*/ + pthread_barrier_init(&self->barrier, NULL, NUM_THREADS + 1); + for (i = 0; i < NUM_THREADS; i++) + ASSERT_EQ(pthread_create(&self->threads[i], NULL, test_thread, &self->barrier), 0); +} + +FIXTURE_TEARDOWN(sigtrap_threads) +{ + pthread_barrier_destroy(&self->barrier); + close(self->fd); + sigaction(SIGTRAP, &self->oldact, NULL); +} + +static void run_test_threads(struct __test_metadata *_metadata, + FIXTURE_DATA(sigtrap_threads) *self) +{ + int i; + + pthread_barrier_wait(&self->barrier); + for (i = 0; i < NUM_THREADS; i++) + ASSERT_EQ(pthread_join(self->threads[i], NULL), 0); +} + +TEST_F(sigtrap_threads, remain_disabled) +{ + run_test_threads(_metadata, self); + EXPECT_EQ(ctx.signal_count, 0); + EXPECT_NE(ctx.tids_want_signal, 0); +} + +TEST_F(sigtrap_threads, enable_event) +{ + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + run_test_threads(_metadata, self); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + + /* Check enabled for parent. */ + ctx.iterate_on = 0; + EXPECT_EQ(ctx.signal_count, NUM_THREADS + 1); +} + +/* Test that modification propagates to all inherited events. */ +TEST_F(sigtrap_threads, modify_and_enable_event) +{ + struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on); + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr), 0); + run_test_threads(_metadata, self); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + + /* Check enabled for parent. 
*/ + ctx.iterate_on = 0; + EXPECT_EQ(ctx.signal_count, NUM_THREADS + 1); +} + +/* Stress test event + signal handling. */ +TEST_F(sigtrap_threads, signal_stress) +{ + ctx.iterate_on = 3000; + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + run_test_threads(_metadata, self); + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS * ctx.iterate_on); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index af3df79d8163..c5ecb4634094 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -include ../../../../../../scripts/Kbuild.include +include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index 1dbfb62567d2..6bb993001680 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -21,7 +21,6 @@ then awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` else # No mpstat command, so use all available CPUs. - echo The mpstat command is not available, so greedily using all CPUs. 
idlecpus=$ncpus fi awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 188b864bc4bf..15d937ba96ca 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -5,10 +5,11 @@ # of this script is to inflict random OS jitter on a concurrently running # test. # -# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# Usage: jitter.sh me jittering-path duration [ sleepmax [ spinmax ] ] # # me: Random-number-generator seed salt. # duration: Time to run in seconds. +# jittering-path: Path to file whose removal will stop this script. # sleepmax: Maximum microseconds to sleep, defaults to one second. # spinmax: Maximum microseconds to spin, defaults to one millisecond. # @@ -17,9 +18,10 @@ # Authors: Paul E. McKenney <paulmck@linux.ibm.com> me=$(($1 * 1000)) -duration=$2 -sleepmax=${3-1000000} -spinmax=${4-1000} +jittering=$2 +duration=$3 +sleepmax=${4-1000000} +spinmax=${5-1000} n=1 @@ -47,7 +49,7 @@ do fi # Check for stop request. - if test -f "$TORTURE_STOPFILE" + if ! test -f "$jittering" then exit 1; fi @@ -67,10 +69,10 @@ do srand(n + me + systime()); ncpus = split(cpus, ca); curcpu = ca[int(rand() * ncpus + 1)]; - mask = lshift(1, curcpu); - if (mask + 0 <= 0) - mask = 1; - printf("%#x\n", mask); + z = ""; + for (i = 1; 4 * i <= curcpu; i++) + z = z "0"; + print "0x" 2 ^ (curcpu % 4) z; }' < /dev/null` n=$(($n+1)) if ! taskset -p $cpumask $$ > /dev/null 2>&1 diff --git a/tools/testing/selftests/rcutorture/bin/jitterstart.sh b/tools/testing/selftests/rcutorture/bin/jitterstart.sh new file mode 100644 index 000000000000..3d710ad291c3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstart.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Start up the specified number of jitter.sh scripts in the background. +# +# Usage: . 
jitterstart.sh n jittering-dir duration [ sleepmax [ spinmax ] ] +# +# n: Number of jitter.sh scripts to start up. +# jittering-dir: Directory in which to put "jittering" file. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +jitter_n=$1 +if test -z "$jitter_n" +then + echo jitterstart.sh: Missing count of jitter.sh scripts to start. + exit 33 +fi +jittering_dir=$2 +if test -z "$jittering_dir" +then + echo jitterstart.sh: Missing directory in which to place jittering file. + exit 34 +fi +shift +shift + +touch ${jittering_dir}/jittering +for ((jitter_i = 1; jitter_i <= $jitter_n; jitter_i++)) +do + jitter.sh $jitter_i "${jittering_dir}/jittering" "$@" & +done diff --git a/tools/testing/selftests/rcutorture/bin/jitterstop.sh b/tools/testing/selftests/rcutorture/bin/jitterstop.sh new file mode 100644 index 000000000000..576a4cf4b79a --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstop.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Remove the "jittering" file, signaling the jitter.sh scripts to stop, +# then wait for them to terminate. +# +# Usage: . jitterstop.sh jittering-dir +# +# jittering-dir: Directory containing "jittering" file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +jittering_dir=$1 +if test -z "$jittering_dir" +then + echo jitterstop.sh: Missing directory containing jittering file.
+ exit 34 +fi + +rm -f ${jittering_dir}/jittering +wait diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh new file mode 100755 index 000000000000..46e47a00a7db --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -0,0 +1,199 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Rerun a series of tests under KVM. +# +# Usage: kvm-again.sh /path/to/old/run [ options ] +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +scriptname=$0 +args="$*" + +T=${TMPDIR-/tmp}/kvm-again.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if ! test -d tools/testing/selftests/rcutorture/bin +then + echo $scriptname must be run from top-level directory of kernel source tree. + exit 1 +fi + +oldrun=$1 +shift +if ! test -d "$oldrun" +then + echo "Usage: $scriptname /path/to/old/run [ options ]" + exit 1 +fi +if ! cp "$oldrun/batches" $T/batches.oldrun +then + # Later on, can reconstitute this from console.log files. + echo Prior run batches file does not exist: $oldrun/batches + exit 1 +fi + +if test -f "$oldrun/torture_suite" +then + torture_suite="`cat $oldrun/torture_suite`" +elif test -f "$oldrun/TORTURE_SUITE" +then + torture_suite="`cat $oldrun/TORTURE_SUITE`" +else + echo "Prior run torture_suite file does not exist: $oldrun/{torture_suite,TORTURE_SUITE}" + exit 1 +fi + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. 
functions.sh + +dryrun= +dur= +default_link="cp -R" +rundir="`pwd`/tools/testing/selftests/rcutorture/res/`date +%Y.%m.%d-%H.%M.%S-again`" + +startdate="`date`" +starttime="`get_starttime`" + +usage () { + echo "Usage: $scriptname $oldrun [ arguments ]:" + echo " --dryrun" + echo " --duration minutes | <seconds>s | <hours>h | <days>d" + echo " --link hard|soft|copy" + echo " --remote" + echo " --rundir /new/res/path" + exit 1 +} + +while test $# -gt 0 +do + case "$1" in + --dryrun) + dryrun=1 + ;; + --duration) + checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(s\|m\|h\|d\|\)$' '^error' + mult=60 + if echo "$2" | grep -q 's$' + then + mult=1 + elif echo "$2" | grep -q 'h$' + then + mult=3600 + elif echo "$2" | grep -q 'd$' + then + mult=86400 + fi + ts=`echo $2 | sed -e 's/[smhd]$//'` + dur=$(($ts*mult)) + shift + ;; + --link) + checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' + case "$2" in + copy) + arg_link="cp -R" + ;; + hard) + arg_link="cp -Rl" + ;; + soft) + arg_link="cp -Rs" + ;; + esac + shift + ;; + --remote) + arg_remote=1 + default_link="cp -as" + ;; + --rundir) + checkarg --rundir "(absolute pathname)" "$#" "$2" '^/' '^error' + rundir=$2 + if test -e "$rundir" + then + echo "--rundir $2: Already exists." + usage + fi + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done +if test -z "$arg_link" +then + arg_link="$default_link" +fi + +echo ---- Re-run results directory: $rundir + +# Copy old run directory tree over and adjust. +mkdir -p "`dirname "$rundir"`" +if ! $arg_link "$oldrun" "$rundir" +then + echo "Cannot copy from $oldrun to $rundir." + usage +fi +rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +echo $oldrun > "$rundir/re-run" +if ! test -d "$rundir/../../bin" +then + $arg_link "$oldrun/../../bin" "$rundir/../.." 
+fi +for i in $rundir/*/qemu-cmd +do + cp "$i" $T + qemu_cmd_dir="`dirname "$i"`" + kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" + jitter_dir="`dirname "$kernel_dir"`" + kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur < $T/qemu-cmd > $i + if test -n "$arg_remote" + then + echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i + fi +done + +# Extract settings from the last qemu-cmd file transformed above. +grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +grep -v '^#' $T/batches.oldrun | awk ' +BEGIN { + oldbatch = 1; +} + +{ + if (oldbatch != $1) { + print "kvm-test-1-run-batch.sh" curbatch; + curbatch = ""; + oldbatch = $1; + } + curbatch = curbatch " " $2; +} + +END { + print "kvm-test-1-run-batch.sh" curbatch +}' > $T/runbatches.sh + +if test -n "$dryrun" +then + echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log" +else + ( cd "$rundir"; sh $T/runbatches.sh ) + kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" + echo | tee -a "$rundir/log" + echo ---- Results directory: $rundir | tee -a "$rundir/log" + kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 + ret=$? 
+ cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" + echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" + exit $ret +fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 47cf4db10896..e01b31b87044 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -30,7 +30,7 @@ do resdir=`echo $i | sed -e 's,/$,,' -e 's,/[^/]*$,,'` head -1 $resdir/log fi - TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" + TORTURE_SUITE="`cat $i/../torture_suite`" configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh new file mode 100755 index 000000000000..7ea0809e229e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified batch of scenarios, which +# might have been built by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-batch.sh SCENARIO [ SCENARIO ... ] +# +# Each SCENARIO is the name of a directory in the current directory +# containing a ready-to-run qemu-cmd file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +T=${TMPDIR-/tmp}/kvm-test-1-run-batch.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +echo ---- Running batch $* +# Check arguments +runfiles= +for i in "$@" +do + if ! echo $i | grep -q '^[^/.a-z]\+\(\.[0-9]\+\)\?$' + then + echo Bad scenario name: \"$i\" 1>&2 + exit 1 + fi + if ! test -d "$i" + then + echo Scenario name not a directory: \"$i\" 1>&2 + exit 2 + fi + if ! 
test -f "$i/qemu-cmd" + then + echo Scenario lacks a command file: \"$i/qemu-cmd\" 1>&2 + exit 3 + fi + rm -f $i/build.* + touch $i/build.run + runfiles="$runfiles $i/build.run" +done + +# Extract settings from the qemu-cmd file. +grep '^#' $1/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Start up jitter, start each scenario, wait, end jitter. +echo ---- System running test: `uname -a` +echo ---- Starting kernels. `date` | tee -a log +$TORTURE_JITTER_START +for i in "$@" +do + echo ---- System running test: `uname -a` > $i/kvm-test-1-run-qemu.sh.out + echo >> $i/kvm-test-1-run-qemu.sh.out + kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & +done +for i in $runfiles +do + while ls $i > /dev/null 2>&1 + do + : + done +done +echo ---- All kernel runs complete. `date` | tee -a log +$TORTURE_JITTER_STOP diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh new file mode 100755 index 000000000000..5b1aa2a4f3f6 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified qemu-cmd file, which might +# have been generated by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-qemu.sh qemu-cmd-dir +# +# qemu-cmd-dir provides the directory containing qemu-cmd file. +# This is assumed to be of the form prefix/ds/scenario, where +# "ds" is the top-level date-stamped directory and "scenario" +# is the scenario name. Any required adjustments to this file +# must have been made by the caller. The shell-command comments +# at the end of the qemu-cmd file are not optional. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +T=${TMPDIR-/tmp}/kvm-test-1-run-qemu.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +resdir="$1" +if !
test -d "$resdir" +then + echo $0: Nonexistent directory: $resdir + exit 1 +fi +if ! test -f "$resdir/qemu-cmd" +then + echo $0: Nonexistent qemu-cmd file: $resdir/qemu-cmd + exit 1 +fi + +echo ' ---' `date`: Starting kernel, PID $$ + +# Obtain settings from the qemu-cmd file. +grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Decorate qemu-cmd with redirection, backgrounding, and PID capture +sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd + +# In case qemu refuses to run... +echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log + +# Attempt to run qemu +kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +commandcompleted=0 +if test -z "$TORTURE_KCONFIG_GDB_ARG" +then + sleep 10 # Give qemu's pid a chance to reach the file + if test -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid + else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid + fi +fi +if test -n "$TORTURE_KCONFIG_GDB_ARG" +then + base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` + if ! test -f $base_resdir/vmlinux + then + base_resdir="`cat re-run`/$resdir" + if ! 
test -f $base_resdir/vmlinux + then + base_resdir=/path/to + fi + fi + echo Waiting for you to attach a debug session, for example: > /dev/tty + echo " gdb $base_resdir/vmlinux" > /dev/tty + echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty + echo " target remote :1234" > /dev/tty + echo " continue" > /dev/tty + kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +fi +while : +do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 + then + if test -n "$TORTURE_KCONFIG_GDB_ARG" + then + : + elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" + then + break; + fi + sleep 1 + else + commandcompleted=1 + if test $kruntime -lt $seconds + then + echo Completed in $kruntime vs. $seconds >> $resdir/Warnings 2>&1 + grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 + killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" + if test -n "$killpid" + then + echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 + ps -fp $killpid >> $resdir/Warnings 2>&1 + fi + else + echo ' ---' `date`: "Kernel done" + fi + break + fi +done +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" +then + if ! 
test -f "$resdir/../STOP.1" + then + echo Grace period for qemu job at pid $qemu_pid + fi + oldline="`tail $resdir/console.log`" + while : + do + if test -f "$resdir/../STOP.1" + then + echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if kill -0 $qemu_pid > /dev/null 2>&1 + then + : + else + break + fi + must_continue=no + newline="`tail $resdir/console.log`" + if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' + then + must_continue=yes + fi + last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" + if test -z "$last_ts" + then + last_ts=0 + fi + if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + must_continue=yes + fi + if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + oldline=$newline + sleep 10 + done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command +fi + +# Tell the script that this run is done. +rm -f $resdir/build.run + +parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 536d103ef166..420ed5ce9d32 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -7,15 +7,15 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. 
# -# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args +# Usage: kvm-test-1-run.sh config resdir seconds qemu-args boot_args_in # # qemu-args defaults to "-enable-kvm -nographic", along with arguments # specifying the number of CPUs and other options # generated from the underlying CPU architecture. -# boot_args defaults to value returned by the per_version_boot_params +# boot_args_in defaults to value returned by the per_version_boot_params # shell function. # -# Anything you specify for either qemu-args or boot_args is appended to +# Anything you specify for either qemu-args or boot_args_in is appended to # the default values. The "-smp" value is deduced from the contents of # the config fragment. # @@ -35,14 +35,13 @@ mkdir $T config_template=${1} config_dir=`echo $config_template | sed -e 's,/[^/]*$,,'` title=`echo $config_template | sed -e 's/^.*\///'` -builddir=${2} -resdir=${3} +resdir=${2} if test -z "$resdir" -o ! -d "$resdir" -o ! -w "$resdir" then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" exit 1 fi -echo ' ---' `date`: Starting build +echo ' ---' `date`: Starting build, PID $$ echo ' ---' Kconfig fragment at: $config_template >> $resdir/log touch $resdir/ConfigFragment.input @@ -73,7 +72,7 @@ config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` -if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux +if test "$base_resdir" != "$resdir" && test -f $base_resdir/bzImage && test -f $base_resdir/vmlinux then # Rerunning previous test, so use that test's kernel. 
QEMU="`identify_qemu $base_resdir/vmlinux`" @@ -83,6 +82,17 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel +elif test "$base_resdir" != "$resdir" +then + # Rerunning previous test for which build failed + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh + echo Initial build failed, not running KVM, see $resdir. + if test -f $resdir/build.wait + then + mv $resdir/build.wait $resdir/build.ready + fi + exit 1 elif kvm-build.sh $T/KcList $resdir then # Had to build a kernel for this test. @@ -107,23 +117,23 @@ else # Build failed. cp .config $resdir || : echo Build failed, not running KVM, see $resdir. - if test -f $builddir.wait + if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi exit 1 fi -if test -f $builddir.wait +if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi -while test -f $builddir.ready +while test -f $resdir/build.ready do sleep 1 done -seconds=$4 -qemu_args=$5 -boot_args=$6 +seconds=$3 +qemu_args=$4 +boot_args_in=$5 if test -z "$TORTURE_BUILDONLY" then @@ -133,7 +143,7 @@ fi # Generate -smp qemu argument. 
qemu_args="-enable-kvm -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment` -cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` +cpu_count=`configfrag_boot_cpus "$boot_args_in" "$config_template" "$cpu_count"` if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS" then echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings @@ -149,16 +159,52 @@ qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" qemu_append="`identify_qemu_append "$QEMU"`" # Pull in Kconfig-fragment boot parameters -boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" +boot_args="`configfrag_boot_params "$boot_args_in" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" if test -n "$TORTURE_BOOT_GDB_ARG" then boot_args="$boot_args $TORTURE_BOOT_GDB_ARG" fi + +# Give bare-metal advice +modprobe_args="`echo $boot_args | tr -s ' ' '\012' | grep "^$TORTURE_MOD\." | sed -e "s/$TORTURE_MOD\.//g"`" +kboot_args="`echo $boot_args | tr -s ' ' '\012' | grep -v "^$TORTURE_MOD\."`" +testid_txt="`dirname $resdir`/testid.txt" +touch $resdir/bare-metal +echo To run this scenario on bare metal: >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo " 1." Set your bare-metal build tree to the state shown in this file: >> $resdir/bare-metal +echo " " $testid_txt >> $resdir/bare-metal +echo " 2." Update your bare-metal build tree"'"s .config based on this file: >> $resdir/bare-metal +echo " " $resdir/ConfigFragment >> $resdir/bare-metal +echo " 3." Make the bare-metal kernel"'"s build system aware of your .config updates: >> $resdir/bare-metal +echo " " $ 'yes "" | make oldconfig' >> $resdir/bare-metal +echo " 4." Build your bare-metal kernel. >> $resdir/bare-metal +echo " 5." 
Boot your bare-metal kernel with the following parameters: >> $resdir/bare-metal +echo " " $kboot_args >> $resdir/bare-metal +echo " 6." Start the test with the following command: >> $resdir/bare-metal +echo " " $ modprobe $TORTURE_MOD $modprobe_args >> $resdir/bare-metal +echo " 7." After some time, end the test with the following command: >> $resdir/bare-metal +echo " " $ rmmod $TORTURE_MOD >> $resdir/bare-metal +echo " 8." Copy your bare-metal kernel"'"s .config file, overwriting this file: >> $resdir/bare-metal +echo " " $resdir/.config >> $resdir/bare-metal +echo " 9." Copy the console output from just before the modprobe to just after >> $resdir/bare-metal +echo " " the rmmod into this file: >> $resdir/bare-metal +echo " " $resdir/console.log >> $resdir/bare-metal +echo "10." Check for runtime errors using the following command: >> $resdir/bare-metal +echo " " $ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh `dirname $resdir` >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo Some of the above steps may be skipped if you build your bare-metal >> $resdir/bare-metal +echo kernel here: `head -n 1 $testid_txt | sed -e 's/^Build directory: //'` >> $resdir/bare-metal + echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd +echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd +echo "# TORTURE_TRUST_MAKE=\"$TORTURE_TRUST_MAKE\"; export TORTURE_TRUST_MAKE" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then @@ -167,140 +213,4 @@ then exit 0 fi -# Decorate qemu-cmd with redirection, backgrounding, and PID capture -sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd 
-echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd - -# In case qemu refuses to run... -echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log - -# Attempt to run qemu -kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & -commandcompleted=0 -if test -z "$TORTURE_KCONFIG_GDB_ARG" -then - sleep 10 # Give qemu's pid a chance to reach the file - if test -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - echo Monitoring qemu job at pid $qemu_pid - else - qemu_pid="" - echo Monitoring qemu job at yet-as-unknown pid - fi -fi -if test -n "$TORTURE_KCONFIG_GDB_ARG" -then - echo Waiting for you to attach a debug session, for example: > /dev/tty - echo " gdb $base_resdir/vmlinux" > /dev/tty - echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty - echo " target remote :1234" > /dev/tty - echo " continue" > /dev/tty - kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -fi -while : -do - if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 - then - if test -n "$TORTURE_KCONFIG_GDB_ARG" - then - : - elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" - then - break; - fi - sleep 1 - else - commandcompleted=1 - if test $kruntime -lt $seconds - then - echo Completed in $kruntime vs. 
$seconds >> $resdir/Warnings 2>&1 - grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 - killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" - if test -n "$killpid" - then - echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 - ps -fp $killpid >> $resdir/Warnings 2>&1 - fi - # Reduce probability of PID reuse by allowing a one-minute buffer - if test $((kruntime + 60)) -lt $seconds && test -s "$resdir/../jitter_pids" - then - awk < "$resdir/../jitter_pids" ' - NF > 0 { - pidlist = pidlist " " $1; - n++; - } - END { - if (n > 0) { - print "kill " pidlist; - } - }' | sh - fi - else - echo ' ---' `date`: "Kernel done" - fi - break - fi -done -if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" -then - qemu_pid=`cat "$resdir/qemu_pid"` -fi -if test $commandcompleted -eq 0 -a -n "$qemu_pid" -then - if ! test -f "$resdir/../STOP.1" - then - echo Grace period for qemu job at pid $qemu_pid - fi - oldline="`tail $resdir/console.log`" - while : - do - if test -f "$resdir/../STOP.1" - then - echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 - then - : - else - break - fi - must_continue=no - newline="`tail $resdir/console.log`" - if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' - then - must_continue=yes - fi - last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" - if test -z "$last_ts" - then - last_ts=0 - fi - if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - must_continue=yes - fi - if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - echo "!!! PID $qemu_pid hung at $kruntime vs. 
$seconds seconds" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - oldline=$newline - sleep 10 - done -elif test -z "$qemu_pid" -then - echo Unknown PID, cannot kill qemu command -fi - -parse-console.sh $resdir/console.log $title +kvm-test-1-run-qemu.sh $resdir diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index c45a953ef393..d40b4e60a50c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -3,7 +3,7 @@ # # Transform a qemu-cmd file to allow reuse. # -# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out +# Usage: kvm-transform.sh bzImage console.log jitter_dir [ seconds ] < qemu-cmd-in > qemu-cmd-out # # bzImage: Kernel and initrd from the same prior kvm.sh run. # console.log: File into which to place console output. @@ -29,20 +29,62 @@ then echo "Need console log file name." exit 1 fi +jitter_dir="$3" +if test -z "$jitter_dir" || ! test -d "$jitter_dir" +then + echo "Need valid jitter directory: '$jitter_dir'" + exit 1 +fi +seconds="$4" +if test -n "$seconds" && echo $seconds | grep -q '[^0-9]' +then + echo "Invalid duration, should be numeric in seconds: '$seconds'" + exit 1 +fi + +awk -v image="$image" -v consolelog="$consolelog" -v jitter_dir="$jitter_dir" \ + -v seconds="$seconds" ' +/^# seconds=/ { + if (seconds == "") + print $0; + else + print "# seconds=" seconds; + next; +} + +/^# TORTURE_JITTER_START=/ { + print "# TORTURE_JITTER_START=\". jitterstart.sh " $4 " " jitter_dir " " $6 " " $7; + next; +} + +/^# TORTURE_JITTER_STOP=/ { + print "# TORTURE_JITTER_STOP=\". 
jitterstop.sh " " " jitter_dir " " $5; + next; +} + +/^#/ { + print $0; + next; +} -awk -v image="$image" -v consolelog="$consolelog" ' { line = ""; for (i = 1; i <= NF; i++) { - if (line == "") + if ("" seconds != "" && $i ~ /\.shutdown_secs=[0-9]*$/) { + sub(/[0-9]*$/, seconds, $i); + if (line == "") + line = $i; + else + line = line " " $i; + } else if (line == "") { line = $i; - else + } else { line = line " " $i; + } if ($i == "-serial") { i++; line = line " file:" consolelog; - } - if ($i == "-kernel") { + } else if ($i == "-kernel") { i++; line = line " " image; } diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 8d3c99b35e06..6bf00a003d3d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -29,17 +29,21 @@ PATH=${KVM}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" +TORTURE_BUILDONLY= TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" TORTURE_KCONFIG_GDB_ARG="" TORTURE_BOOT_GDB_ARG="" TORTURE_QEMU_GDB_ARG="" +TORTURE_JITTER_START="" +TORTURE_JITTER_STOP="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu +TORTURE_MOD=rcutorture TORTURE_TRUST_MAKE="" resdir="" configs="" @@ -100,7 +104,7 @@ do TORTURE_BUILDONLY=1 ;; --configs|--config) - checkarg --configs "(list of config files)" "$#" "$2" '^[^/]\+$' '^--' + checkarg --configs "(list of config files)" "$#" "$2" '^[^/.a-z]\+$' '^--' configs="$configs $2" shift ;; @@ -116,7 +120,7 @@ do shift ;; --datestamp) - checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._-/]*$' '^--' + checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--' ds=$2 shift ;; @@ -215,6 +219,7 @@ do --torture) checkarg --torture "(suite name)" "$#" "$2" 
'^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 + TORTURE_MOD="`echo $TORTURE_SUITE | sed -e 's/^\(lock\|rcu\|scf\)$/\1torture/'`" shift if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale then @@ -381,6 +386,7 @@ TORTURE_QEMU_GDB_ARG="$TORTURE_QEMU_GDB_ARG"; export TORTURE_QEMU_GDB_ARG TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG +TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC @@ -399,12 +405,17 @@ echo Results directory: $resdir/$ds echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log -echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE -pwd > $resdir/$ds/testid.txt +echo ${TORTURE_SUITE} > $resdir/$ds/torture_suite +echo Build directory: `pwd` > $resdir/$ds/testid.txt if test -d .git then + echo Current commit: `git rev-parse HEAD` >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git status"'": >> $resdir/$ds/testid.txt git status >> $resdir/$ds/testid.txt - git rev-parse HEAD >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git diff HEAD"'": >> $resdir/$ds/testid.txt git diff HEAD >> $resdir/$ds/testid.txt fi ___EOF___ @@ -434,8 +445,17 @@ function dump(first, pastlast, batchnum) print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log"; print "needqemurun=" jn=1 + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + print "TORTURE_JITTER_START=\". 
jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] "\"; export TORTURE_JITTER_START"; + print "TORTURE_JITTER_STOP=\". jitterstop.sh " rd " \"; export TORTURE_JITTER_STOP" for (j = first; j < pastlast; j++) { - builddir=KVM "/b" j - first + 1 cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { cfr[jn] = cf[j]; @@ -444,15 +464,15 @@ function dump(first, pastlast, batchnum) cfrep[cf[j]]++; cfr[jn] = cf[j] "." cfrep[cf[j]]; } + builddir=rd cfr[jn] "/build"; if (cpusr[jn] > ncpus && ncpus != 0) ovf = "-ovf"; else ovf = ""; print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log"; - print "rm -f " builddir ".*"; - print "touch " builddir ".wait"; print "mkdir " rd cfr[jn] " || :"; - print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" + print "touch " builddir ".wait"; + print "kvm-test-1-run.sh " CONFIGDIR cf[j], rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; print "while test -f " builddir ".wait" print "do" @@ -461,23 +481,21 @@ function dump(first, pastlast, batchnum) print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` | tee -a " rd "log"; jn++; } + print "runfiles=" for (j = 1; j < jn; j++) { - builddir=KVM "/b" j - print "rm -f " builddir ".ready" + builddir=rd cfr[j] "/build"; + if (TORTURE_BUILDONLY) + print "rm -f " builddir ".ready" + else + print "mv " builddir ".ready " builddir ".run" + print "runfiles=\"$runfiles " builddir ".run\"" + fi print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. 
`date` | tee -a " rd "log"; print "\tneedqemurun=1" print "fi" } - njitter = 0; - split(jitter, ja); - if (ja[1] == -1 && ncpus == 0) - njitter = 1; - else if (ja[1] == -1) - njitter = ncpus; - else - njitter = ja[1]; if (TORTURE_BUILDONLY && njitter != 0) { njitter = 0; print "echo Build-only run, so suppressing jitter | tee -a " rd "log" @@ -488,19 +506,18 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\techo > " rd "jitter_pids" - for (j = 0; j < njitter; j++) { - print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" - print "\techo $! >> " rd "jitter_pids" - } - print "\twait" + print "\t$TORTURE_JITTER_START"; + print "\twhile ls $runfiles > /dev/null 2>&1" + print "\tdo" + print "\t\t:" + print "\tdone" + print "\t$TORTURE_JITTER_STOP"; print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" print "\techo ---- No kernel runs. `date` | tee -a " rd "log"; print "fi" for (j = 1; j < jn; j++) { - builddir=KVM "/b" j print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: | tee -a " rd "log"; print "cat " rd cfr[j] "/kvm-test-1-run.sh.out | tee -a " rd "log"; } @@ -548,6 +565,18 @@ echo 'ret=$?' >> $T/script echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script echo 'exit $ret' >> $T/script +# Extract the tests and their batches from the script. +egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | + sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | + awk ' + /^----Start/ { + batchno = $3; + next; + } + { + print batchno, $1, $2 + }' > $T/batches + if test "$dryrun" = script then cat $T/script @@ -566,21 +595,14 @@ then exit 0 elif test "$dryrun" = batches then - # Extract the tests and their batches from the script. - egrep 'Start batch|Starting build\.' 
$T/script | grep -v ">>" | - sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | - awk ' - /^----Start/ { - batchno = $3; - next; - } - { - print batchno, $1, $2 - }' + cat $T/batches + exit 0 else - # Not a dryrun, so run the script. + # Not a dryrun. Record the batches and the number of CPUs, then run the script. bash $T/script ret=$? + cp $T/batches $resdir/$ds/batches + echo '#' cpus=$cpus >> $resdir/$ds/batches echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret fi diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index ad7525b7ac29..56e2e1a42569 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -374,7 +374,7 @@ done if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi echo " --- " $scriptname $args diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index f2b20db9e296..98b6175e5aa0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -7,8 +7,8 @@ TREE07 TREE09 SRCU-N SRCU-P -SRCU-t -SRCU-u +SRCU-T +SRCU-U TINY01 TINY02 TASKS01 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T index d6557c38dfe4..d6557c38dfe4 100644 --- 
a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot index 238bfe3bd0cc..238bfe3bd0cc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U index 6bc24e99862f..6bc24e99862f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot index ce48c7b82673..ce48c7b82673 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 1c218944b1e9..64f864f1f361 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -4,3 +4,4 @@ rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs +tree.use_softirq=0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 5adc6756792a..a8d94caf7d2f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutree.rcu_fanout_leaf=4 nohz_full=1-7 +rcutree.rcu_fanout_leaf=4 nohz_full=1-N diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot index 22478fd3a865..94d38445d393 100644 
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot @@ -1,3 +1,3 @@ rcupdate.rcu_self_test=1 rcutree.rcu_fanout_exact=1 -rcu_nocbs=0-7 +rcu_nocbs=all diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh index 0333e9b18522..ffbe15109f0d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 rcuscale.shutdown=1 \ - rcuscale.verbose=1 + rcuscale.verbose=0 } diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh index 321e82641287..f81fa2c541a6 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. 
per_version_boot_params () { echo $1 refscale.shutdown=1 \ - refscale.verbose=1 + refscale.verbose=0 } diff --git a/tools/testing/selftests/resctrl/.gitignore b/tools/testing/selftests/resctrl/.gitignore new file mode 100644 index 000000000000..ab68442b6bc8 --- /dev/null +++ b/tools/testing/selftests/resctrl/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +resctrl_tests diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index d585cc1948cc..6bcee2ec91a9 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -1,5 +1,5 @@ CC = $(CROSS_COMPILE)gcc -CFLAGS = -g -Wall +CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README index 6e5a0ffa18e8..4b36b25b6ac0 100644 --- a/tools/testing/selftests/resctrl/README +++ b/tools/testing/selftests/resctrl/README @@ -46,8 +46,8 @@ ARGUMENTS Parameter '-h' shows usage information. usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] - -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf - -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cmt, cat -n no_of_bits: run cache tests using specified no of bits in cache bit mask -p cpu_no: specify CPU number to run the test. 
1 is default -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 38dbf4962e33..68ff856d36f0 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -111,7 +111,7 @@ static int get_llc_perf(unsigned long *llc_perf_miss) /* * Get LLC Occupancy as reported by RESCTRL FS - * For CQM, + * For CMT, * 1. If con_mon grp and mon grp given, then read from mon grp in * con_mon grp * 2. If only con_mon grp given, then read from con_mon grp @@ -182,7 +182,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure cache miss from perf. */ - if (!strcmp(param->resctrl_val, "cat")) { + if (!strncmp(param->resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = get_llc_perf(&llc_perf_miss); if (ret < 0) return ret; @@ -192,7 +192,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure llc occupancy from resctrl. */ - if (!strcmp(param->resctrl_val, "cqm")) { + if (!strncmp(param->resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = get_llc_occu_resctrl(&llc_occu_resc); if (ret < 0) return ret; @@ -234,7 +234,7 @@ int cat_val(struct resctrl_val_param *param) if (ret) return ret; - if ((strcmp(resctrl_val, "cat") == 0)) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = initialize_llc_perf(); if (ret) return ret; @@ -242,7 +242,7 @@ int cat_val(struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. 
*/ while (1) { - if (strcmp(resctrl_val, "cat") == 0) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; @@ -270,3 +270,45 @@ int cat_val(struct resctrl_val_param *param) return ret; } + +/* + * show_cache_info: show cache test result information + * @sum_llc_val: sum of LLC cache result data + * @no_of_bits: number of bits + * @cache_span: cache span in bytes for CMT or in lines for CAT + * @max_diff: max difference + * @max_diff_percent: max difference percentage + * @num_of_runs: number of runs + * @platform: show test information on this platform + * @cmt: CMT test or CAT test + * + * Return: 0 on success. non-zero on failure. + */ +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt) +{ + unsigned long avg_llc_val = 0; + float diff_percent; + long avg_diff = 0; + int ret; + + avg_llc_val = sum_llc_val / (num_of_runs - 1); + avg_diff = (long)abs(cache_span - avg_llc_val); + diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; + + ret = platform && abs((int)diff_percent) > max_diff_percent && + (cmt ? (abs(avg_diff) > max_diff) : true); + + ksft_print_msg("%s Check cache miss rate within %d%%\n", + ret ? "Fail:" : "Pass:", max_diff_percent); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Average LLC val: %lu\n", avg_llc_val); + ksft_print_msg("Cache span (%s): %lu\n", cmt ? 
"bytes" : "lines", + cache_span); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 5da43767b973..cd4f68388e0f 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -17,10 +17,10 @@ #define MAX_DIFF_PERCENT 4 #define MAX_DIFF 1000000 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; /* * Change schemata. Write schemata to specified @@ -52,27 +52,6 @@ static int cat_setup(int num, ...) return ret; } -static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, - unsigned long span) -{ - unsigned long allocated_cache_lines = span / 64; - unsigned long avg_llc_perf_miss = 0; - float diff_percent; - - avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); - diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / - allocated_cache_lines * 100; - - printf("%sok CAT: cache miss rate within %d%%\n", - !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? 
- "not " : "", MAX_DIFF_PERCENT); - tests_run++; - printf("# Percent diff=%d\n", abs((int)diff_percent)); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); - printf("# Allocated cache lines: %lu\n", allocated_cache_lines); -} - static int check_results(struct resctrl_val_param *param) { char *token_array[8], temp[512]; @@ -80,7 +59,7 @@ static int check_results(struct resctrl_val_param *param) int runs = 0, no_of_bits = 0; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Cannot open file"); @@ -108,9 +87,9 @@ static int check_results(struct resctrl_val_param *param) fclose(fp); no_of_bits = count_bits(param->mask); - show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); - - return 0; + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span / 64, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + !is_amd, false); } void cat_test_cleanup(void) @@ -132,11 +111,8 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) if (ret) return ret; - if (!validate_resctrl_feature_request("cat")) - return -1; - /* Get default cbm mask for L3/L2 cache */ - ret = get_cbm_mask(cache_type); + ret = get_cbm_mask(cache_type, cbm_mask); if (ret) return ret; @@ -146,15 +122,18 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) ret = get_cache_size(cpu_no, cache_type, &cache_size); if (ret) return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); /* Get max number of bits from default-cabm mask */ count_of_bits = count_bits(long_mask); - if (n < 1 || n > count_of_bits - 1) { - printf("Invalid input value for no_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", - count_of_bits - 1); + if (!n) + n = count_of_bits / 2; + + if (n > count_of_bits - 1) { + ksft_print_msg("Invalid input value for no_of_bits n!\n"); + ksft_print_msg("Please enter 
value in range 1 to %d\n", + count_of_bits - 1); return -1; } @@ -164,7 +143,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return -1; struct resctrl_val_param param = { - .resctrl_val = "cat", + .resctrl_val = CAT_STR, .cpu_no = cpu_no, .mum_resctrlfs = 0, .setup = cat_setup, diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cmt_test.c index c8756152bd61..8968e36db99d 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Cache Monitoring Technology (CQM) test + * Cache Monitoring Technology (CMT) test * * Copyright (C) 2018 Intel Corporation * @@ -11,17 +11,17 @@ #include "resctrl.h" #include <unistd.h> -#define RESULT_FILE_NAME "result_cqm" +#define RESULT_FILE_NAME "result_cmt" #define NUM_OF_RUNS 5 #define MAX_DIFF 2000000 #define MAX_DIFF_PERCENT 15 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; -static int cqm_setup(int num, ...) +static int cmt_setup(int num, ...) { struct resctrl_val_param *p; va_list param; @@ -39,38 +39,6 @@ static int cqm_setup(int num, ...) return 0; } -static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) -{ - unsigned long avg_llc_occu_resc = 0; - float diff_percent; - long avg_diff = 0; - bool res; - - avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); - avg_diff = (long)abs(span - avg_llc_occu_resc); - - diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - - if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || - (abs(avg_diff) <= MAX_DIFF)) - res = true; - else - res = false; - - printf("%sok CQM: diff within %d, %d\%%\n", res ? 
"" : "not", - MAX_DIFF, (int)MAX_DIFF_PERCENT); - - printf("# diff: %ld\n", avg_diff); - printf("# percent diff=%d\n", abs((int)diff_percent)); - printf("# Results are displayed in (Bytes)\n"); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - printf("# llc_occu_exp (span): %lu\n", span); - - tests_run++; -} - static int check_results(struct resctrl_val_param *param, int no_of_bits) { char *token_array[8], temp[512]; @@ -78,7 +46,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) int runs = 0; FILE *fp; - printf("# checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Error in opening file\n"); @@ -86,7 +54,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) return errno; } - while (fgets(temp, 1024, fp)) { + while (fgets(temp, sizeof(temp), fp)) { char *token = strtok(temp, ":\t"); int fields = 0; @@ -101,17 +69,18 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) runs++; } fclose(fp); - show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); - return 0; + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + true, true); } -void cqm_test_cleanup(void) +void cmt_test_cleanup(void) { remove(RESULT_FILE_NAME); } -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) { int ret, mum_resctrlfs; @@ -122,10 +91,10 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (ret) return ret; - if (!validate_resctrl_feature_request("cqm")) + if (!validate_resctrl_feature_request(CMT_STR)) return -1; - ret = get_cbm_mask("L3"); + ret = get_cbm_mask("L3", cbm_mask); if (ret) return ret; @@ -134,18 +103,18 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) ret = get_cache_size(cpu_no, "L3", &cache_size); if (ret) 
return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); count_of_bits = count_bits(long_mask); if (n < 1 || n > count_of_bits) { - printf("Invalid input value for numbr_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", count_of_bits); + ksft_print_msg("Invalid input value for numbr_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits); return -1; } struct resctrl_val_param param = { - .resctrl_val = "cqm", + .resctrl_val = CMT_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, @@ -154,7 +123,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) .mask = ~(long_mask << n) & long_mask, .span = cache_size * n / count_of_bits, .num_of_runs = 0, - .setup = cqm_setup, + .setup = cmt_setup, }; if (strcmp(benchmark_cmd[0], "fill_buf") == 0) @@ -170,7 +139,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (ret) return ret; - cqm_test_cleanup(); + cmt_test_cleanup(); return 0; } diff --git a/tools/testing/selftests/resctrl/config b/tools/testing/selftests/resctrl/config new file mode 100644 index 000000000000..8d9f2deb56ed --- /dev/null +++ b/tools/testing/selftests/resctrl/config @@ -0,0 +1,2 @@ +CONFIG_X86_CPU_RESCTRL=y +CONFIG_PROC_CPU_RESCTRL=y diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 79c611c99a3d..51e5cf22632f 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -115,7 +115,7 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, while (1) { ret = fill_one_span_read(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } @@ -134,7 +134,7 @@ static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, { while (1) { fill_one_span_write(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if 
(!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7bf8eaa6204b..1a1bdb6180cf 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -12,7 +12,7 @@ #define RESULT_FILE_NAME "result_mba" #define NUM_OF_RUNS 5 -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define ALLOCATION_MAX 100 #define ALLOCATION_MIN 10 #define ALLOCATION_STEP 10 @@ -56,13 +56,14 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) int allocation, runs; bool failed = false; - printf("# Results are displayed in (MB)\n"); + ksft_print_msg("Results are displayed in (MB)\n"); /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - unsigned long avg_diff; + int avg_diff_per; + float avg_diff; /* * The first run is discarded due to inaccurate value from @@ -76,23 +77,26 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); - avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); - - printf("%sok MBA schemata percentage %u smaller than %d %%\n", - avg_diff > MAX_DIFF ? "not " : "", - ALLOCATION_MAX - ALLOCATION_STEP * allocation, - MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", avg_diff); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); - if (avg_diff > MAX_DIFF) + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); + + ksft_print_msg("%s Check MBA diff within %d%% for schemata %u\n", + avg_diff_per > MAX_DIFF_PERCENT ? 
+ "Fail:" : "Pass:", + MAX_DIFF_PERCENT, + ALLOCATION_MAX - ALLOCATION_STEP * allocation); + + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + if (avg_diff_per > MAX_DIFF_PERCENT) failed = true; } - printf("%sok schemata change using MBA%s\n", failed ? "not " : "", - failed ? " # at least one test failed" : ""); - tests_run++; + ksft_print_msg("%s Check schemata change using MBA\n", + failed ? "Fail:" : "Pass:"); + if (failed) + ksft_print_msg("At least one test failed\n"); } static int check_results(void) @@ -141,7 +145,7 @@ void mba_test_cleanup(void) int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) { struct resctrl_val_param param = { - .resctrl_val = "mba", + .resctrl_val = MBA_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, @@ -154,9 +158,6 @@ int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mba")) - return -1; - ret = resctrl_val(benchmark_cmd, ¶m); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 4700f7453f81..8392e5c55ed0 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -11,16 +11,16 @@ #include "resctrl.h" #define RESULT_FILE_NAME "result_mbm" -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define NUM_OF_RUNS 5 -static void +static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) { unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - long avg_diff = 0; - int runs; + int runs, ret, avg_diff_per; + float avg_diff = 0; /* * Discard the first value which is inaccurate due to monitoring setup @@ -33,15 +33,18 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_bw_imc = sum_bw_imc / 4; avg_bw_resc 
= sum_bw_resc / 4; - avg_diff = avg_bw_resc - avg_bw_imc; - - printf("%sok MBM: diff within %d%%\n", - labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", labs(avg_diff)); - printf("# Span (MB): %d\n", span); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); + + ret = avg_diff_per > MAX_DIFF_PERCENT; + ksft_print_msg("%s Check MBM diff within %d%%\n", + ret ? "Fail:" : "Pass:", MAX_DIFF_PERCENT); + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); + ksft_print_msg("Span (MB): %d\n", span); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + + return ret; } static int check_results(int span) @@ -49,10 +52,10 @@ static int check_results(int span) unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; char temp[1024], *token_array[8]; char output[] = RESULT_FILE_NAME; - int runs; + int runs, ret; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(output, "r"); if (!fp) { @@ -76,11 +79,11 @@ static int check_results(int span) runs++; } - show_bw_info(bw_imc, bw_resc, span); + ret = show_bw_info(bw_imc, bw_resc, span); fclose(fp); - return 0; + return ret; } static int mbm_setup(int num, ...) 
@@ -114,7 +117,7 @@ void mbm_test_cleanup(void) int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) { struct resctrl_val_param param = { - .resctrl_val = "mbm", + .resctrl_val = MBM_STR, .ctrlgrp = "c1", .mongrp = "m1", .span = span, @@ -128,9 +131,6 @@ int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mbm")) - return -1; - ret = resctrl_val(benchmark_cmd, &param); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 39bf59c6b9c5..1ad10c47e31d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,11 +23,16 @@ #include <sys/eventfd.h> #include <asm/unistd.h> #include <linux/perf_event.h> +#include "../kselftest.h" #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" #define CBM_MASK_PATH "/sys/fs/resctrl/info" +#define L3_PATH "/sys/fs/resctrl/info/L3" +#define MB_PATH "/sys/fs/resctrl/info/MB" +#define L3_MON_PATH "/sys/fs/resctrl/info/L3_MON" +#define L3_MON_FEATURES_PATH "/sys/fs/resctrl/info/L3_MON/mon_features" #define PARENT_EXIT(err_msg) \ do { \ @@ -62,11 +67,15 @@ struct resctrl_val_param { int (*setup)(int num, ...); }; -pid_t bm_pid, ppid; -int tests_run; +#define MBM_STR "mbm" +#define MBA_STR "mba" +#define CMT_STR "cmt" +#define CAT_STR "cat" -char llc_occup_path[1024]; -bool is_amd; +extern pid_t bm_pid, ppid; + +extern char llc_occup_path[1024]; +extern bool is_amd; bool check_resctrlfs_support(void); int filter_dmesg(void); @@ -74,7 +83,7 @@ int remount_resctrlfs(bool mum_resctrlfs); int get_resource_id(int cpu_no, int *resource_id); int umount_resctrlfs(void); int validate_bw_report_request(char *bw_report); -bool validate_resctrl_feature_request(char *resctrl_val); +bool validate_resctrl_feature_request(const char *resctrl_val); char
*fgrep(FILE *inf, const char *str); int taskset_benchmark(pid_t bm_pid, int cpu_no); void run_benchmark(int signum, siginfo_t *info, void *ucontext); @@ -92,16 +101,20 @@ void tests_cleanup(void); void mbm_test_cleanup(void); int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); void mba_test_cleanup(void); -int get_cbm_mask(char *cache_type); +int get_cbm_mask(char *cache_type, char *cbm_mask); int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int cat_val(struct resctrl_val_param *param); void cat_test_cleanup(void); int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd); unsigned int count_bits(unsigned long n); -void cqm_test_cleanup(void); +void cmt_test_cleanup(void); int get_core_sibling(int cpu_no); int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt); #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 425cc85ac883..f51b5fc066a3 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -37,10 +37,10 @@ void detect_amd(void) static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); - printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); + printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t 
test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cqm, cat\n"); + printf("e.g. -t mbm, mba, cmt, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); printf("\t-h: help\n"); @@ -50,17 +50,88 @@ void tests_cleanup(void) { mbm_test_cleanup(); mba_test_cleanup(); - cqm_test_cleanup(); + cmt_test_cleanup(); + cat_test_cleanup(); +} + +static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBM BW change ...\n"); + + if (!validate_resctrl_feature_request(MBM_STR)) { + ksft_test_result_skip("Hardware does not support MBM or MBM is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", MBA_STR); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBM: bw change\n"); + mbm_test_cleanup(); +} + +static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBA Schemata change ...\n"); + + if (!validate_resctrl_feature_request(MBA_STR)) { + ksft_test_result_skip("Hardware does not support MBA or MBA is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBA: schemata change\n"); + mba_test_cleanup(); +} + +static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) +{ + int res; + + ksft_print_msg("Starting CMT test ...\n"); + if (!validate_resctrl_feature_request(CMT_STR)) { + ksft_test_result_skip("Hardware does not support CMT or CMT is disabled\n"); + return; + } + + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); + ksft_test_result(!res, "CMT: test\n"); + cmt_test_cleanup(); +} + +static void 
run_cat_test(int cpu_no, int no_of_bits) +{ + int res; + + ksft_print_msg("Starting CAT test ...\n"); + + if (!validate_resctrl_feature_request(CAT_STR)) { + ksft_test_result_skip("Hardware does not support CAT or CAT is disabled\n"); + return; + } + + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + ksft_test_result(!res, "CAT: test\n"); cat_test_cleanup(); } int main(int argc, char **argv) { - bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; + bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; + int c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; - int ben_ind, ben_count; + int ben_ind, ben_count, tests = 0; bool cat_test = true; for (i = 0; i < argc; i++) { @@ -73,7 +144,7 @@ int main(int argc, char **argv) } } - while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + while ((c = getopt(argc_new, argv, "ht:b:n:p:")) != -1) { char *token; switch (c) { @@ -82,17 +153,21 @@ int main(int argc, char **argv) mbm_test = false; mba_test = false; - cqm_test = false; + cmt_test = false; cat_test = false; while (token) { - if (!strcmp(token, "mbm")) { + if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; - } else if (!strcmp(token, "mba")) { + tests++; + } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; - } else if (!strcmp(token, "cqm")) { - cqm_test = true; - } else if (!strcmp(token, "cat")) { + tests++; + } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { + cmt_test = true; + tests++; + } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; + tests++; } else { printf("invalid argument\n"); @@ -106,6 +181,10 @@ int main(int argc, char **argv) break; case 'n': no_of_bits = atoi(optarg); + if (no_of_bits <= 0) { + printf("Bail out! 
invalid argument for no_of_bits\n"); + return -1; + } break; case 'h': cmd_help(); @@ -118,7 +197,7 @@ int main(int argc, char **argv) } } - printf("TAP version 13\n"); + ksft_print_header(); /* * Typically we need root privileges, because: @@ -126,7 +205,7 @@ int main(int argc, char **argv) * 2. We execute perf commands */ if (geteuid() != 0) - printf("# WARNING: not running as root, tests may fail.\n"); + return ksft_exit_fail_msg("Not running as root, abort testing.\n"); /* Detect AMD vendor */ detect_amd(); @@ -155,48 +234,26 @@ int main(int argc, char **argv) sprintf(bw_report, "reads"); sprintf(bm_type, "fill_buf"); - check_resctrlfs_support(); + if (!check_resctrlfs_support()) + return ksft_exit_fail_msg("resctrl FS does not exist\n"); + filter_dmesg(); - if (!is_amd && mbm_test) { - printf("# Starting MBM BW change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "mba"); - res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); - printf("%sok MBM: bw change\n", res ? "not " : ""); - mbm_test_cleanup(); - tests_run++; - } + ksft_set_plan(tests ? : 4); - if (!is_amd && mba_test) { - printf("# Starting MBA Schemata change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[1], "%d", span); - res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); - printf("%sok MBA: schemata change\n", res ? "not " : ""); - mba_test_cleanup(); - tests_run++; - } + if (!is_amd && mbm_test) + run_mbm_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (cqm_test) { - printf("# Starting CQM test ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "cqm"); - res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); - printf("%sok CQM: test\n", res ? "not " : ""); - cqm_test_cleanup(); - tests_run++; - } + if (!is_amd && mba_test) + run_mba_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (cat_test) { - printf("# Starting CAT test ...\n"); - res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); - printf("%sok CAT: test\n", res ? 
"not " : ""); - tests_run++; - cat_test_cleanup(); - } + if (cmt_test) + run_cmt_test(has_ben, benchmark_cmd, cpu_no); + + if (cat_test) + run_cat_test(cpu_no, no_of_bits); - printf("1..%d\n", tests_run); + umount_resctrlfs(); - return 0; + return ksft_exit_pass(); } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 520fea3606d1..95224345c78e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -221,8 +221,8 @@ static int read_from_imc_dir(char *imc_dir, int count) */ static int num_of_imcs(void) { + char imc_dir[512], *temp; unsigned int count = 0; - char imc_dir[512]; struct dirent *ep; int ret; DIR *dp; @@ -230,7 +230,25 @@ static int num_of_imcs(void) dp = opendir(DYN_PMU_PATH); if (dp) { while ((ep = readdir(dp))) { - if (strstr(ep->d_name, UNCORE_IMC)) { + temp = strstr(ep->d_name, UNCORE_IMC); + if (!temp) + continue; + + /* + * imc counters are named as "uncore_imc_<n>", hence + * increment the pointer to point to <n>. Note that + * sizeof(UNCORE_IMC) would count for null character as + * well and hence the last underscore character in + * uncore_imc'_' need not be counted. + */ + temp = temp + sizeof(UNCORE_IMC); + + /* + * Some directories under "DYN_PMU_PATH" could have + * names like "uncore_imc_free_running", hence, check if + * first character is a numerical digit or not. + */ + if (temp[0] >= '0' && temp[0] <= '9') { sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, ep->d_name); ret = read_from_imc_dir(imc_dir, count); @@ -282,9 +300,9 @@ static int initialize_mem_bw_imc(void) * Memory B/W utilized by a process on a socket can be calculated using * iMC counters. Perf events are used to read these counters. * - * Return: >= 0 on success. < 0 on failure. + * Return: = 0 on success. < 0 on failure. 
*/ -static float get_mem_bw_imc(int cpu_no, char *bw_report) +static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) { float reads, writes, of_mul_read, of_mul_write; int imc, j, ret; @@ -355,13 +373,18 @@ static float get_mem_bw_imc(int cpu_no, char *bw_report) close(imc_counters_config[imc][WRITE].fd); } - if (strcmp(bw_report, "reads") == 0) - return reads; + if (strcmp(bw_report, "reads") == 0) { + *bw_imc = reads; + return 0; + } - if (strcmp(bw_report, "writes") == 0) - return writes; + if (strcmp(bw_report, "writes") == 0) { + *bw_imc = writes; + return 0; + } - return (reads + writes); + *bw_imc = reads + writes; + return 0; } void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) @@ -397,10 +420,10 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "mbm") == 0) + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) set_mbm_path(ctrlgrp, mongrp, resource_id); - if ((strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { if (ctrlgrp) sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, ctrlgrp, resource_id); @@ -420,9 +443,8 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, * 1. If con_mon grp is given, then read from it * 2. 
If con_mon grp is not given, then read from root con_mon grp */ -static unsigned long get_mem_bw_resctrl(void) +static int get_mem_bw_resctrl(unsigned long *mbm_total) { - unsigned long mbm_total = 0; FILE *fp; fp = fopen(mbm_total_path, "r"); @@ -431,7 +453,7 @@ static unsigned long get_mem_bw_resctrl(void) return -1; } - if (fscanf(fp, "%lu", &mbm_total) <= 0) { + if (fscanf(fp, "%lu", mbm_total) <= 0) { perror("Could not get mbm local bytes"); fclose(fp); @@ -439,7 +461,7 @@ static unsigned long get_mem_bw_resctrl(void) } fclose(fp); - return mbm_total; + return 0; } pid_t bm_pid, ppid; @@ -449,7 +471,7 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) kill(bm_pid, SIGKILL); umount_resctrlfs(); tests_cleanup(); - printf("Ending\n\n"); + ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); } @@ -492,7 +514,7 @@ static int print_results_bw(char *filename, int bm_pid, float bw_imc, return 0; } -static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +static void set_cmt_path(const char *ctrlgrp, const char *mongrp, char sock_num) { if (strlen(ctrlgrp) && strlen(mongrp)) sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, @@ -512,7 +534,7 @@ static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) * @ctrlgrp: Name of the control monitor group (con_mon grp) * @mongrp: Name of the monitor group (mon grp) * @cpu_no: CPU number that the benchmark PID is binded to - * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + * @resctrl_val: Resctrl feature (Eg: cat, cmt.. 
etc) */ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, int cpu_no, char *resctrl_val) @@ -524,14 +546,15 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "cqm") == 0) - set_cqm_path(ctrlgrp, mongrp, resource_id); + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + set_cmt_path(ctrlgrp, mongrp, resource_id); } static int measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) { - unsigned long bw_imc, bw_resc, bw_resc_end; + unsigned long bw_resc, bw_resc_end; + float bw_imc; int ret; /* @@ -541,13 +564,13 @@ measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) * Compare the two values to validate resctrl value. * It takes 1sec to measure the data. */ - bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); - if (bw_imc <= 0) - return bw_imc; + ret = get_mem_bw_imc(param->cpu_no, param->bw_report, &bw_imc); + if (ret < 0) + return ret; - bw_resc_end = get_mem_bw_resctrl(); - if (bw_resc_end <= 0) - return bw_resc_end; + ret = get_mem_bw_resctrl(&bw_resc_end); + if (ret < 0) + return ret; bw_resc = (bw_resc_end - *bw_resc_start) / MB; ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); @@ -579,8 +602,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (strcmp(param->filename, "") == 0) sprintf(param->filename, "stdio"); - if ((strcmp(resctrl_val, "mba")) == 0 || - (strcmp(resctrl_val, "mbm")) == 0) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { ret = validate_bw_report_request(param->bw_report); if (ret) return ret; @@ -645,7 +668,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) PARENT_EXIT("Child is done"); } - printf("# benchmark PID: %d\n", bm_pid); + ksft_print_msg("Benchmark PID: %d\n", bm_pid); /* * Register CTRL-C handler for parent, as it has to kill benchmark @@ -674,15 
+697,15 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (ret) goto out; - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = initialize_mem_bw_imc(); if (ret) goto out; initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } else if (strcmp(resctrl_val, "cqm") == 0) + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -710,8 +733,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. */ while (1) { - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; @@ -721,7 +744,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if (strcmp(resctrl_val, "cqm") == 0) { + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 19c0ec4045a4..5f5a166ade60 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -10,8 +10,6 @@ */ #include "resctrl.h" -int tests_run; - static int find_resctrl_mount(char *buffer) { FILE *mounts; @@ -49,8 +47,6 @@ static int find_resctrl_mount(char *buffer) return -ENOENT; } -char cbm_mask[256]; - /* * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl * @mum_resctrlfs: Should the resctrl FS be remounted? 
@@ -70,28 +66,25 @@ int remount_resctrlfs(bool mum_resctrlfs) if (ret) strcpy(mountpoint, RESCTRL_PATH); - if (!ret && mum_resctrlfs && umount(mountpoint)) { - printf("not ok unmounting \"%s\"\n", mountpoint); - perror("# umount"); - tests_run++; - } + if (!ret && mum_resctrlfs && umount(mountpoint)) + ksft_print_msg("Fail: unmounting \"%s\"\n", mountpoint); if (!ret && !mum_resctrlfs) return 0; + ksft_print_msg("Mounting resctrl to \"%s\"\n", RESCTRL_PATH); ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); - printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", - RESCTRL_PATH); if (ret) perror("# mount"); - tests_run++; - return ret; } int umount_resctrlfs(void) { + if (find_resctrl_mount(NULL)) + return 0; + if (umount(RESCTRL_PATH)) { perror("# Unable to umount resctrl"); @@ -205,16 +198,18 @@ int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) /* * get_cbm_mask - Get cbm mask for given cache * @cache_type: Cache level L2/L3 - * - * Mask is stored in cbm_mask which is global variable. + * @cbm_mask: cbm_mask returned as a string * * Return: = 0 on success, < 0 on failure. 
*/ -int get_cbm_mask(char *cache_type) +int get_cbm_mask(char *cache_type, char *cbm_mask) { char cbm_mask_path[1024]; FILE *fp; + if (!cbm_mask) + return -1; + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); fp = fopen(cbm_mask_path, "r"); @@ -268,7 +263,7 @@ int get_core_sibling(int cpu_no) while (token) { sibling_cpu_no = atoi(token); /* Skipping core 0 as we don't want to run test on core 0 */ - if (sibling_cpu_no != 0) + if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no) break; token = strtok(NULL, "-,"); } @@ -334,7 +329,7 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) operation = atoi(benchmark_cmd[4]); sprintf(resctrl_val, "%s", benchmark_cmd[5]); - if (strcmp(resctrl_val, "cqm") != 0) + if (strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) buffer_span = span * MB; else buffer_span = span; @@ -458,9 +453,9 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, if (ret) goto out; - /* Create mon grp and write pid into it for "mbm" and "cqm" test */ - if ((strcmp(resctrl_val, "cqm") == 0) || - (strcmp(resctrl_val, "mbm") == 0)) { + /* Create mon grp and write pid into it for "mbm" and "cmt" test */ + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); @@ -477,13 +472,10 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, } out: - printf("%sok writing benchmark parameters to resctrl FS\n", - ret ? 
"not " : ""); + ksft_print_msg("Writing benchmark parameters to resctrl FS\n"); if (ret) perror("# writing to resctrlfs"); - tests_run++; - return ret; } @@ -505,13 +497,13 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) int resource_id, ret = 0; FILE *fp; - if ((strcmp(resctrl_val, "mba") != 0) && - (strcmp(resctrl_val, "cat") != 0) && - (strcmp(resctrl_val, "cqm") != 0)) + if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && + strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && + strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) return -ENOENT; if (!schemata) { - printf("# Skipping empty schemata update\n"); + ksft_print_msg("Skipping empty schemata update\n"); return -1; } @@ -528,9 +520,10 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) else sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); - if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); - if (strcmp(resctrl_val, "mba") == 0) + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); fp = fopen(controlgroup, "w"); @@ -551,10 +544,9 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) fclose(fp); out: - printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", - ret ? "not " : "", schema, ret ? " # " : "", - ret ? reason : ""); - tests_run++; + ksft_print_msg("Write schema \"%s\" to resctrl FS%s%s\n", + schema, ret ? " # " : "", + ret ? reason : ""); return ret; } @@ -578,18 +570,20 @@ bool check_resctrlfs_support(void) fclose(inf); - printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not "); - tests_run++; + ksft_print_msg("%s Check kernel supports resctrl filesystem\n", + ret ? 
"Pass:" : "Fail:"); + + if (!ret) + return ret; dp = opendir(RESCTRL_PATH); - printf("%sok resctrl mountpoint \"%s\" exists\n", - dp ? "" : "not ", RESCTRL_PATH); + ksft_print_msg("%s Check resctrl mountpoint \"%s\" exists\n", + dp ? "Pass:" : "Fail:", RESCTRL_PATH); if (dp) closedir(dp); - tests_run++; - printf("# resctrl filesystem %s mounted\n", - find_resctrl_mount(NULL) ? "not" : "is"); + ksft_print_msg("resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); return ret; } @@ -615,26 +609,56 @@ char *fgrep(FILE *inf, const char *str) * validate_resctrl_feature_request - Check if requested feature is valid. * @resctrl_val: Requested feature * - * Return: 0 on success, non-zero on failure + * Return: True if the feature is supported, else false */ -bool validate_resctrl_feature_request(char *resctrl_val) +bool validate_resctrl_feature_request(const char *resctrl_val) { - FILE *inf = fopen("/proc/cpuinfo", "r"); + struct stat statbuf; bool found = false; char *res; + FILE *inf; - if (!inf) + if (!resctrl_val) return false; - res = fgrep(inf, "flags"); - - if (res) { - char *s = strchr(res, ':'); + if (remount_resctrlfs(false)) + return false; - found = s && !strstr(s, resctrl_val); - free(res); + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + if (!stat(L3_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + if (!stat(MB_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + if (!stat(L3_MON_PATH, &statbuf)) { + inf = fopen(L3_MON_FEATURES_PATH, "r"); + if (!inf) + return false; + + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + res = fgrep(inf, "llc_occupancy"); + if (res) { + found = true; + free(res); + } + } + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + res = fgrep(inf, "mbm_total_bytes"); + if (res) { + free(res); + res = fgrep(inf, "mbm_local_bytes"); + if (res) { 
+ found = true; + free(res); + } + } + } + fclose(inf); + } } - fclose(inf); return found; } @@ -671,9 +695,9 @@ int filter_dmesg(void) while (fgets(line, 1024, fp)) { if (strstr(line, "intel_rdt:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); if (strstr(line, "resctrl:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); } fclose(fp); waitpid(pid, NULL, 0); diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 592c1ccf4576..0bd73428d2f3 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -14,7 +14,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __packed __attribute__((packed)) -#include "../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../arch/x86/include/asm/sgx.h" #include "../../../../arch/x86/include/asm/enclu.h" #include "../../../../arch/x86/include/uapi/asm/sgx.h" diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 9d43b75aaa55..f441ac34b4d4 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -45,19 +45,19 @@ static bool encl_map_bin(const char *path, struct encl *encl) fd = open(path, O_RDONLY); if (fd == -1) { - perror("open()"); + perror("enclave executable open()"); return false; } ret = stat(path, &sb); if (ret) { - perror("stat()"); + perror("enclave executable stat()"); goto err; } bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (bin == MAP_FAILED) { - perror("mmap()"); + perror("enclave executable mmap()"); goto err; } @@ -90,8 +90,7 @@ static bool encl_ioc_create(struct encl *encl) ioc.src = (unsigned long)secs; rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc); if (rc) { - fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_CREATE failed"); munmap((void *)secs->base, encl->encl_size); return false; } @@ -116,31 +115,72 @@ static bool 
encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); if (rc < 0) { - fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n", - errno); + perror("SGX_IOC_ENCLAVE_ADD_PAGES failed"); return false; } return true; } + + bool encl_load(const char *path, struct encl *encl) { + const char device_path[] = "/dev/sgx_enclave"; Elf64_Phdr *phdr_tbl; off_t src_offset; Elf64_Ehdr *ehdr; + struct stat sb; + void *ptr; int i, j; int ret; + int fd = -1; memset(encl, 0, sizeof(*encl)); - ret = open("/dev/sgx_enclave", O_RDWR); - if (ret < 0) { - fprintf(stderr, "Unable to open /dev/sgx_enclave\n"); + fd = open(device_path, O_RDWR); + if (fd < 0) { + perror("Unable to open /dev/sgx_enclave"); + goto err; + } + + ret = stat(device_path, &sb); + if (ret) { + perror("device file stat()"); + goto err; + } + + /* + * This just checks if the /dev file has these permission + * bits set. It does not check that the current user is + * the owner or in the owning group. 
+ */ + if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { + fprintf(stderr, "no execute permissions on device file %s\n", device_path); + goto err; + } + + ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + perror("mmap for read"); + goto err; + } + munmap(ptr, PAGE_SIZE); + +#define ERR_MSG \ +"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \ +" Check that current user has execute permissions on %s and \n" \ +" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \ +" If so, remount it executable: mount -o remount,exec /dev\n\n" + + ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + fprintf(stderr, ERR_MSG, device_path); goto err; } + munmap(ptr, PAGE_SIZE); - encl->fd = ret; + encl->fd = fd; if (!encl_map_bin(path, encl)) goto err; @@ -217,6 +257,8 @@ bool encl_load(const char *path, struct encl *encl) return true; err: + if (fd != -1) + close(fd); encl_delete(encl); return false; } @@ -229,7 +271,7 @@ static bool encl_map_area(struct encl *encl) area = mmap(NULL, encl_size * 2, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (area == MAP_FAILED) { - perror("mmap"); + perror("reservation mmap()"); return false; } @@ -268,8 +310,7 @@ bool encl_build(struct encl *encl) ioc.sigstruct = (uint64_t)&encl->sigstruct; ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc); if (ret) { - fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_INIT failed"); return false; } diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 724cec700926..d304a4044eb9 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -15,6 +15,7 @@ #include <sys/stat.h> #include <sys/time.h> #include <sys/types.h> +#include <sys/auxv.h> #include "defines.h" #include "main.h" #include "../kselftest.h" @@ -28,24 +29,6 @@ struct vdso_symtab { Elf64_Word *elf_hashtab; }; -static void 
*vdso_get_base_addr(char *envp[]) -{ - Elf64_auxv_t *auxv; - int i; - - for (i = 0; envp[i]; i++) - ; - - auxv = (Elf64_auxv_t *)&envp[i + 1]; - - for (i = 0; auxv[i].a_type != AT_NULL; i++) { - if (auxv[i].a_type == AT_SYSINFO_EHDR) - return (void *)auxv[i].a_un.a_val; - } - - return NULL; -} - static Elf64_Dyn *vdso_get_dyntab(void *addr) { Elf64_Ehdr *ehdr = addr; @@ -162,7 +145,7 @@ static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r return 0; } -int main(int argc, char *argv[], char *envp[]) +int main(int argc, char *argv[]) { struct sgx_enclave_run run; struct vdso_symtab symtab; @@ -195,7 +178,7 @@ int main(int argc, char *argv[], char *envp[]) addr = mmap((void *)encl.encl_base + seg->offset, seg->size, seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); if (addr == MAP_FAILED) { - fprintf(stderr, "mmap() failed, errno=%d.\n", errno); + perror("mmap() segment failed"); exit(KSFT_FAIL); } } @@ -203,7 +186,8 @@ int main(int argc, char *argv[], char *envp[]) memset(&run, 0, sizeof(run)); run.tcs = encl.encl_base; - addr = vdso_get_base_addr(envp); + /* Get vDSO base address */ + addr = (void *)getauxval(AT_SYSINFO_EHDR); if (!addr) goto err; diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index b8268da5adaa..8e45792703ed 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -764,5 +764,53 @@ "teardown": [ "$TC actions flush action police" ] + }, + { + "id": "cdd7", + "name": "Add valid police action with packets per second rate limit", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order 
[0-9]*: police 0x1 rate 0bit burst 0b mtu 4096Mb pkts_rate 1000 pkts_burst 200", + "matchCount": "1", + "teardown": [ + "$TC actions flush action police" + ] + }, + { + "id": "f5bc", + "name": "Add invalid police action with both bps and pps", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police rate 1kbit burst 10k pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 ", + "matchCount": "0", + "teardown": [ + "$TC actions flush action police" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index 8e8c1ae12260..e0c5f060ccb9 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -24,6 +24,30 @@ ] }, { + "id": "4297", + "name": "Add simple action with change command", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions change action simple sdata \"Not changed\" index 60", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <Not changed>.*index 60 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] + }, + { "id": "6d4c", "name": "Add simple action with duplicate index", "category": [ @@ -151,5 +175,64 @@ "teardown": [ "$TC actions flush action simple" ] + }, + { + "id": "8d07", + "name": "Verify cleanup of failed actions batch add", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + "$TC actions add action simple sdata \"2\" index 2", + [ + "$TC actions add action simple sdata \"1\" 
index 1 action simple sdata \"2\" index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"2\" index 2", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <2>.*index 2 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] + }, + { + "id": "a68a", + "name": "Verify cleanup of failed actions batch change", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + [ + "$TC actions change action simple sdata \"1\" index 1 action simple sdata \"2\" goto chain 42 index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"1\" index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <1>.*index 1 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] } ] diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index bfc974b4572d..ef8eb3604595 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -3,7 +3,7 @@ * (C) Copyright IBM 2012 * Licensed under the GPLv2 * - * NOTE: This is a meta-test which quickly changes the clocksourc and + * NOTE: This is a meta-test which quickly changes the clocksource and * then uses other tests to detect problems. Thus this test requires * that the inconsistency-check and nanosleep tests be present in the * same directory it is run from. 
@@ -134,7 +134,7 @@ int main(int argv, char **argc) return -1; } - /* Check everything is sane before we start switching asyncrhonously */ + /* Check everything is sane before we start switching asynchronously */ for (i = 0; i < count; i++) { printf("Validating clocksource %s\n", clocksource_list[i]); if (change_clocksource(clocksource_list[i])) { diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 19e46ed5dfb5..23eb398c8140 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -5,7 +5,7 @@ * Licensed under the GPLv2 * * This test signals the kernel to insert a leap second - * every day at midnight GMT. This allows for stessing the + * every day at midnight GMT. This allows for stressing the * kernel's leap-second behavior, as well as how well applications * handle the leap-second discontinuity. * diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index dc80728ed191..f70802c5dd0d 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -4,10 +4,10 @@ * (C) Copyright 2013, 2015 Linaro Limited * Licensed under the GPL * - * This test demonstrates leapsecond deadlock that is possibe + * This test demonstrates leapsecond deadlock that is possible * on kernels from 2.6.26 to 3.3. * - * WARNING: THIS WILL LIKELY HARDHANG SYSTEMS AND MAY LOSE DATA + * WARNING: THIS WILL LIKELY HARD HANG SYSTEMS AND MAY LOSE DATA * RUN AT YOUR OWN RISK! 
* To build: * $ gcc leapcrash.c -o leapcrash -lrt diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index cf3e48919874..80aed4bf06fb 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -76,7 +76,7 @@ void checklist(struct timespec *list, int size) /* The shared thread shares a global list * that each thread fills while holding the lock. - * This stresses clock syncronization across cpus. + * This stresses clock synchronization across cpus. */ void *shared_thread(void *arg) { diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d42115e4284d..8b0cd421ebd3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -101,7 +101,7 @@ endef ifeq ($(CAN_BUILD_I386),1) $(BINARIES_32): CFLAGS += -m32 $(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): %_32: %.c +$(BINARIES_32): $(OUTPUT)/%_32: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif @@ -109,7 +109,7 @@ endif ifeq ($(CAN_BUILD_X86_64),1) $(BINARIES_64): CFLAGS += -m64 $(BINARIES_64): LDLIBS += -lrt -ldl -$(BINARIES_64): %_64: %.c +$(BINARIES_64): $(OUTPUT)/%_64: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) endif diff --git a/tools/testing/selftests/x86/thunks_32.S b/tools/testing/selftests/x86/thunks_32.S index a71d92da8f46..f3f56e681e9f 100644 --- a/tools/testing/selftests/x86/thunks_32.S +++ b/tools/testing/selftests/x86/thunks_32.S @@ -45,3 +45,5 @@ call64_from_32: ret .size call64_from_32, .-call64_from_32 + +.section .note.GNU-stack,"",%progbits diff --git a/tools/thermal/tmon/Makefile b/tools/thermal/tmon/Makefile index 59e417ec3e13..9db867df7679 100644 --- a/tools/thermal/tmon/Makefile +++ b/tools/thermal/tmon/Makefile @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: GPL-2.0 # We need this for the "cc-option" macro. -include ../../../scripts/Kbuild.include +include ../../build/Build.include VERSION = 1.0 diff --git a/tools/usb/usbip/doc/usbip.8 b/tools/usb/usbip/doc/usbip.8 index a15d20063b98..1f26e4a00638 100644 --- a/tools/usb/usbip/doc/usbip.8 +++ b/tools/usb/usbip/doc/usbip.8 @@ -50,9 +50,16 @@ Attach a remote USB device. .PP .HP +\fBattach\fR \-\-remote=<\fIhost\fR> \-\-device=<\fIdev_id\fR> +.IP +Attach a remote USB gadget. +Only used when the remote usbipd is in device mode. +.PP + +.HP \fBdetach\fR \-\-port=<\fIport\fR> .IP -Detach an imported USB device. +Detach an imported USB device/gadget. .PP .HP @@ -74,11 +81,25 @@ List USB devices exported by a remote host. .PP .HP +\fBlist\fR \-\-device +.IP +List USB gadgets of local usbip-vudc. +Only used when the local usbipd is in device mode. +Note that this can not list usbip-vudc USB gadgets of the remote device mode usbipd. +.PP + +.HP \fBlist\fR \-\-local .IP List local USB devices. .PP +.HP +\fBport\fR +.IP +List imported devices/gadgets. +.PP + .SH EXAMPLES @@ -90,8 +111,27 @@ List local USB devices. client:# usbip attach --remote=server --busid=1-2 - Connect the remote USB device. + client:# usbip port + - List imported devices/gadgets. + client:# usbip detach --port=0 - Detach the usb device. +The following example shows the usage of device mode + + server:# usbip list --device + - List gadgets exported by local usbipd server. + + client:# modprobe vhci-hcd + + client:# usbip attach --remote=server --device=usbip-vudc.0 + - Connect the remote USB gadget. + + client:# usbip port + - List imported devices/gadgets. + + client:# usbip detach --port=0 + - Detach the usb gadget. + .SH "SEE ALSO" \fBusbipd\fP\fB(8)\fB\fP diff --git a/tools/usb/usbip/doc/usbipd.8 b/tools/usb/usbip/doc/usbipd.8 index fb62a756893b..d974394f86a1 100644 --- a/tools/usb/usbip/doc/usbipd.8 +++ b/tools/usb/usbip/doc/usbipd.8 @@ -30,6 +30,12 @@ Bind to IPv6. 
Default is both. .PP .HP +\fB\-e\fR, \fB\-\-device\fR +.IP +Run in device mode. Rather than drive an attached device, create a virtual UDC to bind gadgets to. +.PP + +.HP \fB\-D\fR, \fB\-\-daemon\fR .IP Run as a daemon process. @@ -86,6 +92,26 @@ USB/IP client can connect and use exported devices. - A usb device 1-2 is now exportable to other hosts! - Use 'usbip unbind --busid=1-2' when you want to shutdown exporting and use the device locally. +The following example shows the usage of device mode + + server:# modprobe usbip-vudc + - Use /sys/class/udc/ interface. + - usbip-host is independent of this module. + + server:# usbipd -e -D + - Start usbip daemon in device mode. + + server:# modprobe g_mass_storage file=/tmp/tmp.img + - Bind a gadget to usbip-vudc. + - in this example, a mass storage gadget is bound. + + server:# usbip list --device + - List gadgets exported by local usbipd server. + + server:# modprobe -r g_mass_storage + - Unbind a gadget from usbip-vudc. + - in this example, the previous mass storage gadget is unbound. + .SH "SEE ALSO" \fBusbip\fP\fB(8)\fB\fP diff --git a/tools/usb/usbip/libsrc/list.h b/tools/usb/usbip/libsrc/list.h index a941671e4900..9cca2425587b 100644 --- a/tools/usb/usbip/libsrc/list.h +++ b/tools/usb/usbip/libsrc/list.h @@ -77,17 +77,17 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) #define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. 
*/ -static inline void __list_del_entry(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); -} - static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); diff --git a/tools/usb/usbip/src/usbip_list.c b/tools/usb/usbip/src/usbip_list.c index 8625b0f514ee..3d810bcca02f 100644 --- a/tools/usb/usbip/src/usbip_list.c +++ b/tools/usb/usbip/src/usbip_list.c @@ -33,7 +33,8 @@ static const char usbip_list_usage_string[] = "usbip list [-p|--parsable] <args>\n" " -p, --parsable Parsable list format\n" " -r, --remote=<host> List the exportable USB devices on <host>\n" - " -l, --local List the local USB devices\n"; + " -l, --local List the local USB devices\n" + " -d, --device List the local USB gadgets bound to usbip-vudc\n"; void usbip_list_usage(void) { |