diff options
Diffstat (limited to 'tools')
98 files changed, 4879 insertions, 923 deletions
diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index a61051400311..253690eb3c26 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -35,6 +35,8 @@ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ +/* x86-64 REX2 prefix */ +#define INAT_PFX_REX2 16 /* 0xD5 */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -50,7 +52,7 @@ /* Legacy prefix */ #define INAT_PFX_OFFS 0 -#define INAT_PFX_BITS 4 +#define INAT_PFX_BITS 5 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ @@ -77,6 +79,9 @@ #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) +#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) +#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) +#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -128,6 +133,11 @@ static inline int inat_is_rex_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } +static inline int inat_is_rex2_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX2; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) @@ -227,4 +237,9 @@ static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } + +static inline int inat_evex_scalable(insn_attr_t attr) +{ + return attr & INAT_EVEX_SCALABLE; +} #endif diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 65c0d9ce1e29..0e5abd896ad4 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -112,10 +112,15 @@ struct insn { #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) -#define X86_REX_W(rex) ((rex) & 8) -#define X86_REX_R(rex) ((rex) & 4) -#define X86_REX_X(rex) ((rex) & 2) -#define X86_REX_B(rex) ((rex) & 1) +#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ +#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ +#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ +#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ + +#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ +#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ +#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ +#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ @@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); +static inline int insn_is_rex2(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return insn->rex_prefix.nbytes == 2; +} + +static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) +{ + return X86_REX2_M(insn->rex_prefix.bytes[1]); +} + static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_vex_w_bit(struct insn *insn) +{ + if (insn->vex_prefix.nbytes < 3) + return 0; + return X86_VEX_W(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index ada4b4a79dd4..a43b37346a22 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -185,6 +185,17 @@ found: if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; + } else if (inat_is_rex2_prefix(attr)) { + insn_set_byte(&insn->rex_prefix, 0, b); + b = peek_nbyte_next(insn_byte_t, insn, 1); + insn_set_byte(&insn->rex_prefix, 1, b); + insn->rex_prefix.nbytes = 2; + insn->next_byte += 2; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + insn->rex_prefix.got = 1; + goto vex_end; } } insn->rex_prefix.got = 1; @@ -283,6 +294,10 @@ int insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); + /* SCALABLE EVEX uses p bits to encode operand size */ + if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && + p == INAT_PFX_OPNDSZ) + insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { @@ -294,6 +309,20 @@ int insn_get_opcode(struct insn *insn) goto end; } + /* Check if there is REX2 prefix or not */ + if (insn_is_rex2(insn)) { + if (insn_rex2_m_bit(insn)) { + /* map 1 is escape 0x0f */ + insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); + + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); + } else { + insn->attr = inat_get_opcode_attribute(op); + } + goto end; + } + insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 12af572201a2..caedb3ef6688 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -23,6 +23,7 @@ # # AVX Superscripts # (ev): this opcode requires EVEX prefix. +# (es): this opcode requires EVEX prefix and is SCALABALE. # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. @@ -33,6 +34,10 @@ # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. +# +# REX2 Prefix +# - (!REX2): REX2 is not allowed +# - (REX2): REX2 variant e.g. JMPABS Table: one byte opcode Referrer: @@ -148,7 +153,7 @@ AVXcode: 65: SEG=GS (Prefix) 66: Operand-Size (Prefix) 67: Address-Size (Prefix) -68: PUSH Iz (d64) +68: PUSH Iz 69: IMUL Gv,Ev,Iz 6a: PUSH Ib (d64) 6b: IMUL Gv,Ev,Ib @@ -157,22 +162,22 @@ AVXcode: 6e: OUTS/OUTSB DX,Xb 6f: OUTS/OUTSW/OUTSD DX,Xz # 0x70 - 0x7f -70: JO Jb -71: JNO Jb -72: JB/JNAE/JC Jb -73: JNB/JAE/JNC Jb -74: JZ/JE Jb -75: JNZ/JNE Jb -76: JBE/JNA Jb -77: JNBE/JA Jb -78: JS Jb -79: JNS Jb -7a: JP/JPE Jb -7b: JNP/JPO Jb -7c: JL/JNGE Jb -7d: JNL/JGE Jb -7e: JLE/JNG Jb -7f: JNLE/JG Jb +70: JO Jb (!REX2) +71: JNO Jb (!REX2) +72: JB/JNAE/JC Jb (!REX2) +73: JNB/JAE/JNC Jb (!REX2) +74: JZ/JE Jb (!REX2) +75: JNZ/JNE Jb (!REX2) +76: JBE/JNA Jb (!REX2) +77: JNBE/JA Jb (!REX2) +78: JS Jb (!REX2) +79: JNS Jb (!REX2) +7a: JP/JPE Jb (!REX2) +7b: JNP/JPO Jb (!REX2) +7c: JL/JNGE Jb (!REX2) +7d: JNL/JGE Jb (!REX2) +7e: JLE/JNG Jb (!REX2) +7f: JNLE/JG Jb (!REX2) # 0x80 - 0x8f 80: Grp1 Eb,Ib (1A) 81: Grp1 Ev,Iz (1A) @@ -208,24 +213,24 @@ AVXcode: 9e: SAHF 9f: LAHF # 0xa0 - 0xaf -a0: MOV AL,Ob -a1: MOV rAX,Ov -a2: MOV Ob,AL -a3: MOV Ov,rAX -a4: MOVS/B Yb,Xb -a5: MOVS/W/D/Q Yv,Xv -a6: CMPS/B Xb,Yb -a7: CMPS/W/D Xv,Yv -a8: TEST AL,Ib -a9: TEST rAX,Iz -aa: STOS/B Yb,AL -ab: STOS/W/D/Q Yv,rAX -ac: LODS/B AL,Xb -ad: LODS/W/D/Q rAX,Xv -ae: SCAS/B AL,Yb +a0: MOV AL,Ob (!REX2) +a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64) +a2: MOV Ob,AL (!REX2) +a3: MOV Ov,rAX (!REX2) +a4: MOVS/B Yb,Xb (!REX2) +a5: MOVS/W/D/Q Yv,Xv (!REX2) +a6: CMPS/B Xb,Yb (!REX2) +a7: CMPS/W/D Xv,Yv (!REX2) +a8: TEST AL,Ib (!REX2) +a9: TEST rAX,Iz (!REX2) +aa: STOS/B Yb,AL (!REX2) +ab: STOS/W/D/Q Yv,rAX (!REX2) +ac: LODS/B AL,Xb (!REX2) +ad: LODS/W/D/Q rAX,Xv (!REX2) +ae: SCAS/B AL,Yb (!REX2) # Note: The May 2011 Intel manual shows Xv for the second parameter of the # next instruction but Yv is correct -af: SCAS/W/D/Q rAX,Yv +af: SCAS/W/D/Q rAX,Yv (!REX2) # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A) d2: Grp2 Eb,CL (1A) d3: Grp2 Ev,CL (1A) d4: AAM Ib (i64) -d5: AAD Ib (i64) +d5: AAD Ib (i64) | REX2 (Prefix),(o64) d6: d7: XLAT/XLATB d8: ESC @@ -281,26 +286,26 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) -e1: LOOPE/LOOPZ Jb (f64) -e2: LOOP Jb (f64) -e3: JrCXZ Jb (f64) -e4: IN AL,Ib -e5: IN eAX,Ib -e6: OUT Ib,AL -e7: OUT Ib,eAX +e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) +e1: LOOPE/LOOPZ Jb (f64) (!REX2) +e2: LOOP Jb (f64) (!REX2) +e3: JrCXZ Jb (f64) (!REX2) +e4: IN AL,Ib (!REX2) +e5: IN eAX,Ib (!REX2) +e6: OUT Ib,AL (!REX2) +e7: OUT Ib,eAX (!REX2) # With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) -e9: JMP-near Jz (f64) -ea: JMP-far Ap (i64) -eb: JMP-short Jb (f64) -ec: IN AL,DX -ed: IN eAX,DX -ee: OUT DX,AL -ef: OUT DX,eAX +e8: CALL Jz (f64) (!REX2) +e9: JMP-near Jz (f64) (!REX2) +ea: JMP-far Ap (i64) (!REX2) +eb: JMP-short Jb (f64) (!REX2) +ec: IN AL,DX (!REX2) +ed: IN eAX,DX (!REX2) +ee: OUT DX,AL (!REX2) +ef: OUT DX,eAX (!REX2) # 0xf0 - 0xff f0: LOCK (Prefix) f1: @@ -386,14 +391,14 @@ AVXcode: 1 2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) 2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f -30: WRMSR -31: RDTSC -32: RDMSR -33: RDPMC -34: SYSENTER -35: SYSEXIT +30: WRMSR (!REX2) +31: RDTSC (!REX2) +32: RDMSR (!REX2) +33: RDPMC (!REX2) +34: SYSENTER (!REX2) +35: SYSEXIT (!REX2) 36: -37: GETSEC +37: GETSEC (!REX2) 38: escape # 3-byte escape 1 39: 3a: escape # 3-byte escape 2 @@ -473,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) -81: JNO Jz (f64) -82: JB/JC/JNAE Jz (f64) -83: JAE/JNB/JNC Jz (f64) -84: JE/JZ Jz (f64) -85: JNE/JNZ Jz (f64) -86: JBE/JNA Jz (f64) -87: JA/JNBE Jz (f64) -88: JS Jz (f64) -89: JNS Jz (f64) -8a: JP/JPE Jz (f64) -8b: JNP/JPO Jz (f64) -8c: JL/JNGE Jz (f64) -8d: JNL/JGE Jz (f64) -8e: JLE/JNG Jz (f64) -8f: JNLE/JG Jz (f64) +80: JO Jz (f64) (!REX2) +81: JNO Jz (f64) (!REX2) +82: JB/JC/JNAE Jz (f64) (!REX2) +83: JAE/JNB/JNC Jz (f64) (!REX2) +84: JE/JZ Jz (f64) (!REX2) +85: JNE/JNZ Jz (f64) (!REX2) +86: JBE/JNA Jz (f64) (!REX2) +87: JA/JNBE Jz (f64) (!REX2) +88: JS Jz (f64) (!REX2) +89: JNS Jz (f64) (!REX2) +8a: JP/JPE Jz (f64) (!REX2) +8b: JNP/JPO Jz (f64) (!REX2) +8c: JL/JNGE Jz (f64) (!REX2) +8d: JNL/JGE Jz (f64) (!REX2) +8e: JLE/JNG Jz (f64) (!REX2) +8f: JNLE/JG Jz (f64) (!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -698,17 +703,17 @@ AVXcode: 2 4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev) 4e: vrsqrt14ps/d Vpd,Wpd (66),(ev) 4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev) -50: vpdpbusd Vx,Hx,Wx (66),(ev) -51: vpdpbusds Vx,Hx,Wx (66),(ev) -52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) -53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) +50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v) +51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v) +52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) +53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) 54: vpopcntb/w Vx,Wx (66),(ev) 55: vpopcntd/q Vx,Wx (66),(ev) 58: vpbroadcastd Vx,Wx (66),(v) 59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo) 5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo) 5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev) -5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) +5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64) # Skip 0x5d 5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1) # Skip 0x5f-0x61 @@ -718,10 +723,12 @@ AVXcode: 2 65: vblendmps/d Vx,Hx,Wx (66),(ev) 66: vpblendmb/w Vx,Hx,Wx (66),(ev) 68: vp2intersectd/q Kx,Hx,Wx (F2),(ev) -# Skip 0x69-0x6f +# Skip 0x69-0x6b +6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | TCMMRLFP16PS Vt,Wt,Ht (v1),(o64) +# Skip 0x6d-0x6f 70: vpshldvw Vx,Hx,Wx (66),(ev) 71: vpshldvd/q Vx,Hx,Wx (66),(ev) -72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev) +72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev) 73: vpshrdvd/q Vx,Hx,Wx (66),(ev) 75: vpermi2b/w Vx,Hx,Wx (66),(ev) 76: vpermi2d/q Vx,Hx,Wx (66),(ev) @@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -b4: vpmadd52luq Vx,Hx,Wx (66),(ev) -b5: vpmadd52huq Vx,Hx,Wx (66),(ev) +b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v) +b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v) +b4: vpmadd52luq Vx,Hx,Wx (66) +b5: vpmadd52huq Vx,Hx,Wx (66) b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) @@ -796,15 +805,35 @@ c7: Grp19 (1A) c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev) c9: sha1msg1 Vdq,Wdq ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev) -cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) -cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) -cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) +cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v) +cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v) +cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v) cf: vgf2p8mulb Vx,Wx (66) +d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | vpdpwuud Vx,Hx,Wx (v) +d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v) +d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B) +da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v) db: VAESIMC Vdq,Wdq (66),(v1) -dc: vaesenc Vx,Hx,Wx (66) -dd: vaesenclast Vx,Hx,Wx (66) -de: vaesdec Vx,Hx,Wx (66) -df: vaesdeclast Vx,Hx,Wx (66) +dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3) +dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3) +de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3) +df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3) +e0: CMPOXADD My,Gy,By (66),(v1),(o64) +e1: CMPNOXADD My,Gy,By (66),(v1),(o64) +e2: CMPBXADD My,Gy,By (66),(v1),(o64) +e3: CMPNBXADD My,Gy,By (66),(v1),(o64) +e4: CMPZXADD My,Gy,By (66),(v1),(o64) +e5: CMPNZXADD My,Gy,By (66),(v1),(o64) +e6: CMPBEXADD My,Gy,By (66),(v1),(o64) +e7: CMPNBEXADD My,Gy,By (66),(v1),(o64) +e8: CMPSXADD My,Gy,By (66),(v1),(o64) +e9: CMPNSXADD My,Gy,By (66),(v1),(o64) +ea: CMPPXADD My,Gy,By (66),(v1),(o64) +eb: CMPNPXADD My,Gy,By (66),(v1),(o64) +ec: CMPLXADD My,Gy,By (66),(v1),(o64) +ed: CMPNLXADD My,Gy,By (66),(v1),(o64) +ee: CMPLEXADD My,Gy,By (66),(v1),(o64) +ef: CMPNLEXADD My,Gy,By (66),(v1),(o64) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) @@ -812,8 +841,11 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) -f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B) f9: MOVDIRI My,Gy +fa: ENCODEKEY128 Ew,Ew (F3) +fb: ENCODEKEY256 Ew,Ew (F3) +fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev) cc: sha1rnds4 Vdq,Wdq,Ib ce: vgf2p8affineqb Vx,Wx,Ib (66) cf: vgf2p8affineinvqb Vx,Wx,Ib (66) +de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1) df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B) EndTable +Table: EVEX map 4 +Referrer: +AVXcode: 4 +00: ADD Eb,Gb (ev) +01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es) +02: ADD Gb,Eb (ev) +03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es) +08: OR Eb,Gb (ev) +09: OR Ev,Gv (es) | OR Ev,Gv (66),(es) +0a: OR Gb,Eb (ev) +0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es) +10: ADC Eb,Gb (ev) +11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es) +12: ADC Gb,Eb (ev) +13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es) +18: SBB Eb,Gb (ev) +19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es) +1a: SBB Gb,Eb (ev) +1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es) +20: AND Eb,Gb (ev) +21: AND Ev,Gv (es) | AND Ev,Gv (66),(es) +22: AND Gb,Eb (ev) +23: AND Gv,Ev (es) | AND Gv,Ev (66),(es) +24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es) +28: SUB Eb,Gb (ev) +29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es) +2a: SUB Gb,Eb (ev) +2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es) +2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es) +30: XOR Eb,Gb (ev) +31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es) +32: XOR Gb,Eb (ev) +33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es) +# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE, +# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ +38: CCMPSCC Eb,Gb (ev) +39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es) +3a: CCMPSCC Gv,Ev (ev) +3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es) +40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev) +41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev) +42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev) +43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev) +44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev) +45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev) +46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev) +47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev) +48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev) +49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev) +4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev) +4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev) +4c: CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev) +4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev) +4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev) +4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev) +60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es) +61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es) +65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev) +66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev) +69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es) +6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es) +80: Grp1 Eb,Ib (1A),(ev) +81: Grp1 Ev,Iz (1A),(es) +83: Grp1 Ev,Ib (1A),(es) +# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, +# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ +84: CTESTSCC (ev) +85: CTESTSCC (es) | CTESTSCC (66),(es) +88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) +8f: POP2 Bq,Rq (000),(11B),(ev) +a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) +ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es) +af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es) +c0: Grp2 Eb,Ib (1A),(ev) +c1: Grp2 Ev,Ib (1A),(es) +d0: Grp2 Eb,1 (1A),(ev) +d1: Grp2 Ev,1 (1A),(es) +d2: Grp2 Eb,CL (1A),(ev) +d3: Grp2 Ev,CL (1A),(es) +f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev) +f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev) +f2: INVPCID Gy,Mdq (F3),(ev) +f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es) +f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es) +f6: Grp3_1 Eb (1A),(ev) +f7: Grp3_2 Ev (1A),(es) +f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev) +f9: MOVDIRI My,Gy (ev) +fe: Grp4 (1A),(ev) +ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev) +EndTable + Table: EVEX map 5 Referrer: AVXcode: 5 @@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev) d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev) EndTable +Table: VEX map 7 +Referrer: +AVXcode: 7 +f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1051,7 +1182,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms @@ -1137,6 +1268,8 @@ GrpTable: Grp16 1: prefetch T0 2: prefetch T1 3: prefetch T2 +6: prefetch IT1 +7: prefetch IT0 EndTable GrpTable: Grp17 diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index af38469afd14..5770c8097f32 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -64,7 +64,9 @@ BEGIN { modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" - rex_expr = "^REX(\\.[XRWB]+)*" + rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))" + rex2_expr = "\\(REX2\\)" + no_rex2_expr = "\\(!REX2\\)" fpu_expr = "^ESC" # TODO lprefix1_expr = "\\((66|!F3)\\)" @@ -81,6 +83,8 @@ BEGIN { vexonly_expr = "\\(v\\)" # All opcodes with (ev) superscript supports *only* EVEX prefix evexonly_expr = "\\(ev\\)" + # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size + evex_scalable_expr = "\\(es\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -99,6 +103,7 @@ BEGIN { prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" + prefix_num["REX2"] = "INAT_PFX_REX2" clear_vars() } @@ -314,6 +319,10 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, force64_expr)) flags = add_flags(flags, "INAT_FORCE64") + # check REX2 not allowed + if (match(ext, no_rex2_expr)) + flags = add_flags(flags, "INAT_NO_REX2") + # check REX prefix if (match(opcode, rex_expr)) flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") @@ -325,6 +334,8 @@ function convert_operands(count,opnd, i,j,imm,mod) # check VEX codes if (match(ext, evexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY") + else if (match(ext, evex_scalable_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY | INAT_EVEX_SCALABLE") else if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) @@ -351,6 +362,8 @@ function convert_operands(count,opnd, i,j,imm,mod) lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" } + if (match(ext, rex2_expr)) + table[idx] = add_flags(table[idx], "INAT_REX2_VARIANT") if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1d3a90d93fe2..270c28a0d098 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -146,12 +146,11 @@ def detect_kernel_config(): def for_each_slab(prog): - PGSlab = 1 << prog.constant('PG_slab') - PGHead = 1 << prog.constant('PG_head') + PGSlab = ~prog.constant('PG_slab') for page in for_each_page(prog): try: - if page.flags.value_() & PGSlab: + if page.page_type.value_() == PGSlab: yield cast('struct slab *', page) except FaultError: pass diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb9794421..95483c7d81df 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 2190adbe3002..ea32b101b999 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1221,7 +1221,7 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) diff --git a/tools/include/uapi/linux/memfd.h b/tools/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..01c0324e7733 --- /dev/null +++ b/tools/include/uapi/linux/memfd.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_MEMFD_H +#define _LINUX_MEMFD_H + +#include <asm-generic/hugetlb_encode.h> + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + +#endif /* _LINUX_MEMFD_H */ diff --git a/tools/include/uapi/linux/userfaultfd.h b/tools/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..4283de22d5b6 --- /dev/null +++ b/tools/include/uapi/linux/userfaultfd.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include <linux/types.h> + +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; + } pagefault; + + struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } remove; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __attribute__((packed)); + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. + */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. + */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 466a29d80124..2a4c71501a17 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -105,7 +105,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) */ int probe_memcg_account(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 4e783cc7fc4b..a336786a22a3 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -22,7 +22,7 @@ int probe_fd(int fd) static int probe_kern_prog_name(int token_fd) { - const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6be5..9e7307186b7f 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 92e139b9c792..2d6dce2c8f77 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -3,6 +3,8 @@ CC = $(CROSS_COMPILE)gcc BUILD_OUTPUT := $(CURDIR) PREFIX ?= /usr DESTDIR ?= +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = turbostat-$(DAY) ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -22,9 +24,30 @@ override CFLAGS += -D_FORTIFY_SOURCE=2 .PHONY : clean clean : @rm -f $(BUILD_OUTPUT)/turbostat + @rm -f $(SNAPSHOT).tar.gz install : turbostat - install -d $(DESTDIR)$(PREFIX)/bin + install -d $(DESTDIR)$(PREFIX)/bin install $(BUILD_OUTPUT)/turbostat $(DESTDIR)$(PREFIX)/bin/turbostat - install -d $(DESTDIR)$(PREFIX)/share/man/man8 + install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 turbostat.8 $(DESTDIR)$(PREFIX)/share/man/man8 + +snapshot: turbostat + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp turbostat Makefile turbostat.c turbostat.8 ../../../../arch/x86/include/asm/intel-family.h $(SNAPSHOT) + + @sed -e 's/^#include <linux\/bits.h>/#include "bits.h"/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DINTEL_FAMILY_HEADER='\"intel-family.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' -e's/.*INTEL_FAMILY_HEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 0d3672e5d9ed..8d37acd39201 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -155,7 +155,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .PP -\fBUncMHz\fP uncore MHz, instantaneous sample. +\fBUncMHz\fP per-package uncore MHz, instantaneous sample. +.PP +\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98256468e248..8cdf41906e98 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -38,6 +38,7 @@ #include <stdbool.h> #include <assert.h> #include <linux/kernel.h> +#include <linux/build_bug.h> #define UNUSED(x) (void)(x) @@ -58,15 +59,22 @@ #define MAX_NOFILE 0x8000 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; +enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; + +struct sysfs_path { + char path[PATH_BYTES]; + int id; + struct sysfs_path *next; +}; struct msr_counter { unsigned int msr_num; char name[NAME_BYTES]; - char path[PATH_BYTES]; + struct sysfs_path *sp; unsigned int width; enum counter_type type; enum counter_format format; @@ -78,64 +86,64 @@ struct msr_counter { }; struct msr_counter bic[] = { - { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, - { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, - { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, - { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, - { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, - { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, - { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, - { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Package", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Node", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Core", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -216,6 +224,28 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +/* + * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: + * If you change the values, note they are used both in comparisons + * (>= PCL__7) and to index pkg_cstate_limit_strings[]. + */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ + struct amperf_group_fd; char *proc_stat = "/proc/stat"; @@ -299,6 +329,9 @@ struct gfx_sysfs_info { static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); +int add_counter(unsigned int msr_num, char *path, char *name, + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -663,6 +696,23 @@ static const struct platform_features adl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features arl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -905,8 +955,10 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE, &cnl_features }, - { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, + { INTEL_FAM6_ARROWLAKE_H, &arl_features }, + { INTEL_FAM6_ARROWLAKE_U, &arl_features }, + { INTEL_FAM6_ARROWLAKE, &arl_features }, + { INTEL_FAM6_LUNARLAKE_M, &arl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, @@ -979,8 +1031,9 @@ char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; -#define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 +#define MAX_ADDED_CORE_COUNTERS 8 +#define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 /* Indexes used to map data read from perf and MSRs into global variables */ @@ -1022,6 +1075,7 @@ struct rapl_counter_info_t { /* struct rapl_counter_info_t for each RAPL domain */ struct rapl_counter_info_t *rapl_counter_info_perdomain; +unsigned int rapl_counter_info_perdomain_size; #define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) @@ -1152,6 +1206,161 @@ struct rapl_counter { double scale; }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum ccstate_rci_index { + CCSTATE_RCI_INDEX_C1_RESIDENCY = 0, + CCSTATE_RCI_INDEX_C3_RESIDENCY = 1, + CCSTATE_RCI_INDEX_C6_RESIDENCY = 2, + CCSTATE_RCI_INDEX_C7_RESIDENCY = 3, + PCSTATE_RCI_INDEX_C2_RESIDENCY = 4, + PCSTATE_RCI_INDEX_C3_RESIDENCY = 5, + PCSTATE_RCI_INDEX_C6_RESIDENCY = 6, + PCSTATE_RCI_INDEX_C7_RESIDENCY = 7, + PCSTATE_RCI_INDEX_C8_RESIDENCY = 8, + PCSTATE_RCI_INDEX_C9_RESIDENCY = 9, + PCSTATE_RCI_INDEX_C10_RESIDENCY = 10, + NUM_CSTATE_COUNTERS, +}; + +struct cstate_counter_info_t { + unsigned long long data[NUM_CSTATE_COUNTERS]; + enum cstate_source source[NUM_CSTATE_COUNTERS]; + unsigned long long msr[NUM_CSTATE_COUNTERS]; + int fd_perf_core; + int fd_perf_pkg; +}; + +struct cstate_counter_info_t *ccstate_counter_info; +unsigned int ccstate_counter_info_size; + +#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE (1u << 0) +#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE) +#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2) + +struct cstate_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + unsigned long long flags; + int pkg_cstate_limit; +}; + +static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { + { + .feature_mask = CC1, + .perf_subsys = "cstate_core", + .perf_name = "c1-residency", + .msr = MSR_CORE_C1_RES, + .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY, + .bic = BIC_CPU_c1, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC3, + .perf_subsys = "cstate_core", + .perf_name = "c3-residency", + .msr = MSR_CORE_C3_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_CPU_c3, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC6, + .perf_subsys = "cstate_core", + .perf_name = "c6-residency", + .msr = MSR_CORE_C6_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_CPU_c6, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = CC7, + .perf_subsys = "cstate_core", + .perf_name = "c7-residency", + .msr = MSR_CORE_C7_RESIDENCY, + .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_CPU_c7, + .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY, + .pkg_cstate_limit = 0, + }, + { + .feature_mask = PC2, + .perf_subsys = "cstate_pkg", + .perf_name = "c2-residency", + .msr = MSR_PKG_C2_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY, + .bic = BIC_Pkgpc2, + .flags = 0, + .pkg_cstate_limit = PCL__2, + }, + { + .feature_mask = PC3, + .perf_subsys = "cstate_pkg", + .perf_name = "c3-residency", + .msr = MSR_PKG_C3_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY, + .bic = BIC_Pkgpc3, + .flags = 0, + .pkg_cstate_limit = PCL__3, + }, + { + .feature_mask = PC6, + .perf_subsys = "cstate_pkg", + .perf_name = "c6-residency", + .msr = MSR_PKG_C6_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY, + .bic = BIC_Pkgpc6, + .flags = 0, + .pkg_cstate_limit = PCL__6, + }, + { + .feature_mask = PC7, + .perf_subsys = "cstate_pkg", + .perf_name = "c7-residency", + .msr = MSR_PKG_C7_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY, + .bic = BIC_Pkgpc7, + .flags = 0, + .pkg_cstate_limit = PCL__7, + }, + { + .feature_mask = PC8, + .perf_subsys = "cstate_pkg", + .perf_name = "c8-residency", + .msr = MSR_PKG_C8_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY, + .bic = BIC_Pkgpc8, + .flags = 0, + .pkg_cstate_limit = PCL__8, + }, + { + .feature_mask = PC9, + .perf_subsys = "cstate_pkg", + .perf_name = "c9-residency", + .msr = MSR_PKG_C9_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY, + .bic = BIC_Pkgpc9, + .flags = 0, + .pkg_cstate_limit = PCL__9, + }, + { + .feature_mask = PC10, + .perf_subsys = "cstate_pkg", + .perf_name = "c10-residency", + .msr = MSR_PKG_C10_RESIDENCY, + .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY, + .bic = BIC_Pkgpc10, + .flags = 0, + .pkg_cstate_limit = PCL_10, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1181,7 +1390,7 @@ struct core_data { struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1214,7 +1423,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; - unsigned long long counter[MAX_ADDED_COUNTERS]; + unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1357,36 +1566,42 @@ struct sys_counters { struct msr_counter *pp; } sys; -void free_sys_counters(void) +static size_t free_msr_counters_(struct msr_counter **pp) { - struct msr_counter *p = sys.tp, *pnext = NULL; + struct msr_counter *p = NULL; + size_t num_freed = 0; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + while (*pp) { + p = *pp; - p = sys.cp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; - } + if (p->msr_num != 0) { + *pp = p->next; + + free(p); + ++num_freed; - p = sys.pp, pnext = NULL; - while (p) { - pnext = p->next; - free(p); - p = pnext; + continue; + } + + pp = &p->next; } - sys.added_thread_counters = 0; - sys.added_core_counters = 0; - sys.added_package_counters = 0; - sys.tp = NULL; - sys.cp = NULL; - sys.pp = NULL; + return num_freed; +} + +/* + * Free all added counters accessed via msr. + */ +static void free_sys_msr_counters(void) +{ + /* Thread counters */ + sys.added_thread_counters -= free_msr_counters_(&sys.tp); + + /* Core counters */ + sys.added_core_counters -= free_msr_counters_(&sys.cp); + + /* Package counters */ + sys.added_package_counters -= free_msr_counters_(&sys.pp); } struct system_summary { @@ -1415,6 +1630,9 @@ struct topo_params { int allowed_cpus; int allowed_cores; int max_cpu_num; + int max_core_id; + int max_package_id; + int max_die_id; int max_node_num; int nodes_per_pkg; int cores_per_node; @@ -1529,23 +1747,12 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = - BIC_SMI | - BIC_CPU_c1 | - BIC_CPU_c3 | - BIC_CPU_c6 | - BIC_CPU_c7 | - BIC_Mod_c6 | - BIC_CoreTmp | - BIC_Totl_c0 | - BIC_Any_c0 | - BIC_GFX_c0 | - BIC_CPUGFX | - BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; - free_sys_counters(); + free_sys_msr_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -1928,13 +2135,15 @@ void print_header(char *delim) if (mp->format == FORMAT_RAW) { if (mp->width == 64) outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else + else if (mp->width == 32) outp += sprintf(outp, "%s%10.10s", delim, mp->name); + else + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } else { if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) outp += sprintf(outp, "%s%8s", delim, mp->name); else - outp += sprintf(outp, "%s%s", delim, mp->name); + outp += sprintf(outp, "%s%7.7s", delim, mp->name); } } @@ -1966,7 +2175,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->path); + t->counter[i], mp->sp->path); } } @@ -1987,7 +2196,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->path); + c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -2023,7 +2232,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->path); + p->counter[i], mp->sp->path); } } @@ -2388,7 +2597,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } + } else if (mp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } done: @@ -2498,6 +2708,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; + else if (mp->format == FORMAT_AVERAGE) + old->counter[i] = new->counter[i]; else old->counter[i] = new->counter[i] - old->counter[i]; } @@ -2970,7 +3182,7 @@ unsigned long long snapshot_sysfs_counter(char *path) return counter; } -int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) +int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path) { if (mp->msr_num != 0) { assert(!no_msr); @@ -2980,25 +3192,40 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path); *counterp = snapshot_sysfs_counter(path); } else { - *counterp = snapshot_sysfs_counter(mp->path); + *counterp = snapshot_sysfs_counter(counter_path); } } return 0; } -unsigned long long get_uncore_mhz(int package, int die) +unsigned long long get_legacy_uncore_mhz(int package) { char path[128]; + int die; + static int warn_once; + + /* + * for this package, use the first die_id that exists + */ + for (die = 0; die <= topo.max_die_id; ++die) { - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, - die); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", + package, die); - return (snapshot_sysfs_counter(path) / 1000); + if (access(path, R_OK) == 0) + return (snapshot_sysfs_counter(path) / 1000); + } + if (!warn_once) { + warnx("BUG: %s: No %s", __func__, path); + warn_once = 1; + } + + return 0; } int get_epb(int cpu) @@ -3361,6 +3588,17 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) return ret; } +static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) + if (cci->source[i] == CSTATE_SOURCE_PERF) + ++ret; + + return ret; +} + void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) { rc->raw_value = rci->data[idx]; @@ -3368,15 +3606,18 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci rc->scale = rci->scale[idx]; } -int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p) { unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; - struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + struct rapl_counter_info_t *rci; if (debug) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); + assert(domain < rapl_counter_info_perdomain_size); + + rci = &rapl_counter_info_perdomain[domain]; /* * If we have any perf counters to read, read them all now, in bulk @@ -3432,7 +3673,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data } } - _Static_assert(NUM_RAPL_COUNTERS == 7); + BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7); write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); @@ -3444,6 +3685,154 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data return 0; } +char *find_sysfs_path_by_id(struct sysfs_path *sp, int id) +{ + while (sp) { + if (sp->id == id) + return (sp->path); + sp = sp->next; + } + if (debug) + warnx("%s: id%d not found", __func__, id); + return NULL; +} + +int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + /* + * Overcommit memory a little bit here, + * but skip calculating exact sizes for the buffers. + */ + unsigned long long perf_data[NUM_CSTATE_COUNTERS]; + unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1]; + unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1]; + + struct cstate_counter_info_t *cci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(ccstate_counter_info); + assert(cpu <= ccstate_counter_info_size); + + memset(perf_data, 0, sizeof(perf_data)); + memset(perf_data_core, 0, sizeof(perf_data_core)); + memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + + cci = &ccstate_counter_info[cpu]; + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + const size_t num_perf_counters = cstate_counter_info_count_perf(cci); + ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long); + ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0; + + if (cci->fd_perf_core != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core)); + + if (actual_read_size_core <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core); + } + + if (cci->fd_perf_pkg != -1) { + /* Each descriptor read begins with number of counters read. */ + expected_read_size += sizeof(unsigned long long); + + actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg)); + + if (actual_read_size_pkg <= 0) + err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg); + } + + const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg; + + if (actual_read_size_total != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total); + + /* + * Copy ccstate and pcstate data into unified buffer. + * + * Skip first element from core and pkg buffers. + * Kernel puts there how many counters were read. + */ + const size_t num_core_counters = perf_data_core[0]; + const size_t num_pkg_counters = perf_data_pkg[0]; + + assert(num_perf_counters == num_core_counters + num_pkg_counters); + + /* Copy ccstate perf data */ + memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long)); + + /* Copy pcstate perf data */ + memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long)); + + for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { + switch (cci->source[i]) { + case CSTATE_SOURCE_NONE: + break; + + case CSTATE_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); + + if (debug) { + fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); + } + + cci->data[i] = perf_data[pi]; + + ++pi; + break; + + case CSTATE_SOURCE_MSR: + assert(!no_msr); + if (get_msr(cpu, cci->msr[i], &cci->data[i])) + return -13 - i; + + if (debug) { + fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); + } + + break; + } + } + + /* + * Helper to write the data only if the source of + * the counter for the current cpu is not none. + * + * Otherwise we would overwrite core data with 0 (default value), + * when invoked for the thread sibling. + */ +#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ + if (cci->source[index] != CSTATE_SOURCE_NONE) \ + out_counter = cci->data[index]; \ +} while (0) + + BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11); + + PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY); + + PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY); + PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY); + +#undef PERF_COUNTER_WRITE_DATA + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3499,13 +3888,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { - if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) - return -6; - } + + get_cstate_counters(cpu, t, c, p); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &t->counter[i])) + if (get_mp(cpu, mp, &t->counter[i], mp->sp->path)) return -10; } @@ -3519,31 +3906,14 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return status; } - if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { - if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) - return -6; - } - - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { - if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { - if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) - return -7; - } - - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { - if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) - return -8; - else if (t->is_atom) { - /* - * For Atom CPUs that has core cstate deeper than c6, - * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. - * Minus CC7 (and deeper cstates) residency to get - * accturate cc6 residency. - */ - c->c6 -= c->c7; - } + if (DO_BIC(BIC_CPU_c7) && t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; } if (DO_BIC(BIC_Mod_c6)) @@ -3560,7 +3930,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) get_core_throt_cnt(cpu, &c->core_throt_cnt); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &c->counter[i])) + if (get_mp(cpu, mp, &c->counter[i], mp->sp->path)) return -10; } @@ -3584,34 +3954,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0)) return -13; } - if (DO_BIC(BIC_Pkgpc3)) - if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) - return -9; - if (DO_BIC(BIC_Pkgpc6)) { - if (platform->has_msr_atom_pkg_c6_residency) { - if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } else { - if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) - return -10; - } - } - - if (DO_BIC(BIC_Pkgpc2)) - if (get_msr(cpu, MSR_PKG_C2_RESIDENCY, &p->pc2)) - return -11; - if (DO_BIC(BIC_Pkgpc7)) - if (get_msr(cpu, MSR_PKG_C7_RESIDENCY, &p->pc7)) - return -12; - if (DO_BIC(BIC_Pkgpc8)) - if (get_msr(cpu, MSR_PKG_C8_RESIDENCY, &p->pc8)) - return -13; - if (DO_BIC(BIC_Pkgpc9)) - if (get_msr(cpu, MSR_PKG_C9_RESIDENCY, &p->pc9)) - return -13; - if (DO_BIC(BIC_Pkgpc10)) - if (get_msr(cpu, MSR_PKG_C10_RESIDENCY, &p->pc10)) - return -13; if (DO_BIC(BIC_CPU_LPI)) p->cpu_lpi = cpuidle_cur_cpu_lpi_us; @@ -3630,9 +3972,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) - p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + p->uncore_mhz = get_legacy_uncore_mhz(p->package_id); if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; @@ -3653,7 +3994,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (get_mp(cpu, mp, &p->counter[i])) + char *path = NULL; + + if (mp->msr_num == 0) { + path = find_sysfs_path_by_id(mp->sp, p->package_id); + if (path == NULL) { + warnx("%s: package_id %d not found", __func__, p->package_id); + return -10; + } + } + if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } done: @@ -3662,31 +4012,8 @@ done: return 0; } -/* - * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit: - * If you change the values, note they are used both in comparisons - * (>= PCL__7) and to index pkg_cstate_limit_strings[]. - */ - -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ - int pkg_cstate_limit = PCLUKN; -char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", +char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; @@ -4175,12 +4502,32 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_cstate(void) +{ + if (!ccstate_counter_info) + return; + + const int counter_info_num = ccstate_counter_info_size; + + for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) { + if (ccstate_counter_info[counter_id].fd_perf_core != -1) + close(ccstate_counter_info[counter_id].fd_perf_core); + + if (ccstate_counter_info[counter_id].fd_perf_pkg != -1) + close(ccstate_counter_info[counter_id].fd_perf_pkg); + } + + free(ccstate_counter_info); + ccstate_counter_info = NULL; + ccstate_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) return; - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const int num_domains = rapl_counter_info_perdomain_size; for (int domain_id = 0; domain_id < num_domains; ++domain_id) { if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) @@ -4188,6 +4535,8 @@ void free_fd_rapl_percpu(void) } free(rapl_counter_info_perdomain); + rapl_counter_info_perdomain = NULL; + rapl_counter_info_perdomain_size = 0; } void free_all_buffers(void) @@ -4234,6 +4583,7 @@ void free_all_buffers(void) free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free_fd_rapl_percpu(); + free_fd_cstate(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4569,6 +4919,7 @@ static void update_effective_set(bool startup) void linux_perf_init(void); void rapl_perf_init(void); +void cstate_perf_init(void); void re_initialize(void) { @@ -4576,6 +4927,7 @@ void re_initialize(void) setup_all_buffers(false); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5294,30 +5646,27 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void probe_intel_uncore_frequency(void) +static void probe_intel_uncore_frequency_legacy(void) { int i, j; char path[256]; - if (!genuine_intel) - return; - - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - goto probe_cluster; - - BIC_PRESENT(BIC_UNCORE_MHZ); - - if (quiet) - return; - for (i = 0; i < topo.num_packages; ++i) { - for (j = 0; j < topo.num_die; ++j) { + for (j = 0; j <= topo.max_die_id; ++j) { int k, l; char path_base[128]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); + if (access(path_base, R_OK)) + continue; + + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); @@ -5335,24 +5684,39 @@ static void probe_intel_uncore_frequency(void) fprintf(outf, " %d MHz\n", k / 1000); } } - return; +} + +static void probe_intel_uncore_frequency_cluster(void) +{ + int i, uncore_max_id; + char path[256]; + char path_base[128]; -probe_cluster: if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) return; if (quiet) return; - for (i = 0;; ++i) { + for (uncore_max_id = 0;; ++uncore_max_id) { + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id); + + /* uncore## start at 00 and skips no numbers, so stop upon first missing */ + if (access(path_base, R_OK)) { + uncore_max_id -= 1; + break; + } + } + for (i = uncore_max_id; i >= 0; --i) { int k, l; - char path_base[128]; int package_id, domain_id, cluster_id; + char name_buf[16]; sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); if (access(path_base, R_OK)) - break; + err(1, "%s: %s\n", __func__, path_base); sprintf(path, "%s/package_id", path_base); package_id = read_sysfs_int(path); @@ -5379,9 +5743,25 @@ probe_cluster: sprintf(path, "%s/current_freq_khz", path_base); k = read_sysfs_int(path); fprintf(outf, " %d MHz\n", k / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id); + + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); } } +static void probe_intel_uncore_frequency(void) +{ + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0) + probe_intel_uncore_frequency_cluster(); + else + probe_intel_uncore_frequency_legacy(); +} + static void probe_graphics(void) { /* Xe graphics sysfs knobs */ @@ -5466,7 +5846,6 @@ next: else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) @@ -6405,7 +6784,8 @@ bool is_aperf_access_required(void) return BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC); + || BIC_IS_ENABLED(BIC_IPC) + || BIC_IS_ENABLED(BIC_CPU_c1); } int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, @@ -6478,17 +6858,18 @@ void linux_perf_init(void) void rapl_perf_init(void) { - const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1; bool *domain_visited = calloc(num_domains, sizeof(bool)); rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); if (rapl_counter_info_perdomain == NULL) err(-1, "calloc rapl_counter_info_percpu"); + rapl_counter_info_perdomain_size = num_domains; /* * Initialize rapl_counter_info_percpu */ - for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; rci->fd_perf = -1; @@ -6508,7 +6889,7 @@ void rapl_perf_init(void) bool has_counter = 0; double scale; enum rapl_unit unit; - int next_domain; + unsigned int next_domain; memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); @@ -6521,6 +6902,8 @@ void rapl_perf_init(void) next_domain = platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + assert(next_domain < num_domains); + if (domain_visited[next_domain]) continue; @@ -6634,42 +7017,160 @@ static int has_amperf_access(void) return 0; } -void probe_cstates(void) +int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) { - probe_cst_limit(); + if (strcmp(group_name, "cstate_core") == 0) + return &cci->fd_perf_core; - if (platform->supported_cstates & CC1) - BIC_PRESENT(BIC_CPU_c1); + if (strcmp(group_name, "cstate_pkg") == 0) + return &cci->fd_perf_pkg; - if (platform->supported_cstates & CC3) - BIC_PRESENT(BIC_CPU_c3); + return NULL; +} - if (platform->supported_cstates & CC6) - BIC_PRESENT(BIC_CPU_c6); +int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + if (no_perf) + return -1; - if (platform->supported_cstates & CC7) - BIC_PRESENT(BIC_CPU_c7); + int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys); - if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); + if (pfd_group == NULL) + return -1; - if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) - BIC_PRESENT(BIC_Pkgpc3); + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); - if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) - BIC_PRESENT(BIC_Pkgpc6); + const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); - if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); + if (fd_counter == -1) + return -1; - if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) - BIC_PRESENT(BIC_Pkgpc8); + /* If it's the first counter opened, make it a group descriptor */ + if (*pfd_group == -1) + *pfd_group = fd_counter; - if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) - BIC_PRESENT(BIC_Pkgpc9); + return fd_counter; +} - if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) - BIC_PRESENT(BIC_Pkgpc10); +int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai) +{ + int ret = add_cstate_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); + + return ret; +} + +void cstate_perf_init_(bool soft_c1) +{ + bool has_counter; + bool *cores_visited = NULL, *pkg_visited = NULL; + const int cores_visited_elems = topo.max_core_id + 1; + const int pkg_visited_elems = topo.max_package_id + 1; + const int cci_num = topo.max_cpu_num + 1; + + ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info)); + if (!ccstate_counter_info) + err(1, "calloc ccstate_counter_arch_info"); + ccstate_counter_info_size = cci_num; + + cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited)); + if (!cores_visited) + err(1, "calloc cores_visited"); + + pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited)); + if (!pkg_visited) + err(1, "calloc pkg_visited"); + + /* Initialize cstate_counter_info_percpu */ + for (int cpu = 0; cpu < cci_num; ++cpu) { + ccstate_counter_info[cpu].fd_perf_core = -1; + ccstate_counter_info[cpu].fd_perf_pkg = -1; + } + + for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) { + has_counter = false; + memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited)); + memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited)); + + const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx]; + + for (int cpu = 0; cpu < cci_num; ++cpu) { + + struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + const int core_id = cpus[cpu].physical_core_id; + const int pkg_id = cpus[cpu].physical_package_id; + + assert(core_id < cores_visited_elems); + assert(pkg_id < pkg_visited_elems); + + const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD; + const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE; + + if (!per_thread && cores_visited[core_id]) + continue; + + if (!per_core && pkg_visited[pkg_id]) + continue; + + const bool counter_needed = BIC_IS_ENABLED(cai->bic) || + (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY)); + const bool counter_supported = (platform->supported_cstates & cai->feature_mask); + + if (counter_needed && counter_supported) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { + + cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit + && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + } + } + + if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + has_counter = true; + cores_visited[core_id] = true; + pkg_visited[pkg_id] = true; + } + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(cores_visited); + free(pkg_visited); +} + +void cstate_perf_init(void) +{ + /* + * If we don't have a C1 residency MSR, we calculate it "in software", + * but we need APERF, MPERF too. + */ + const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access() + && platform->supported_cstates & CC1; + + if (soft_c1) + BIC_PRESENT(BIC_CPU_c1); + + cstate_perf_init_(soft_c1); +} + +void probe_cstates(void) +{ + probe_cst_limit(); if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); @@ -6927,6 +7428,22 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); } +static void counter_info_init(void) +{ + for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) { + struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i]; + + if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY) + cai->msr = MSR_KNL_CORE_C6_RESIDENCY; + + if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES) + cai->msr = 0; + + if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) + cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; + } +} + void probe_pm_features(void) { probe_pstates(); @@ -6967,7 +7484,6 @@ void topology_probe(bool startup) int i; int max_core_id = 0; int max_package_id = 0; - int max_die_id = 0; int max_siblings = 0; /* Initialize num_cpus, max_cpu_num */ @@ -7084,8 +7600,8 @@ void topology_probe(bool startup) /* get die information */ cpus[i].die_id = get_die_id(i); - if (cpus[i].die_id > max_die_id) - max_die_id = cpus[i].die_id; + if (cpus[i].die_id > topo.max_die_id) + topo.max_die_id = cpus[i].die_id; /* get numa node information */ cpus[i].physical_node_id = get_physical_node_id(&cpus[i]); @@ -7104,6 +7620,8 @@ void topology_probe(bool startup) if (cpus[i].thread_id == 0) topo.num_cores++; } + topo.max_core_id = max_core_id; + topo.max_package_id = max_package_id; topo.cores_per_node = max_core_id + 1; if (debug > 1) @@ -7111,9 +7629,9 @@ void topology_probe(bool startup) if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); - topo.num_die = max_die_id + 1; + topo.num_die = topo.max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); @@ -7403,10 +7921,12 @@ void turbostat_init() check_msr_access(); check_perf_access(); process_cpuid(); + counter_info_init(); probe_pm_features(); set_amperf_source(); linux_perf_init(); rapl_perf_init(); + cstate_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -7497,7 +8017,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -7523,61 +8043,114 @@ void print_bootcmd(void) fclose(fp); } +struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) +{ + struct msr_counter *mp; + + for (mp = head; mp; mp = mp->next) { + if (debug) + printf("%s: %s %s\n", __func__, name, mp->name); + if (!strncmp(name, mp->name, strlen(mp->name))) + return mp; + } + return NULL; +} + int add_counter(unsigned int msr_num, char *path, char *name, unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; if (no_msr && msr_num) errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); - msrp = calloc(1, sizeof(struct msr_counter)); - if (msrp == NULL) { - perror("calloc"); - exit(1); - } - - msrp->msr_num = msr_num; - strncpy(msrp->name, name, NAME_BYTES - 1); - if (path) - strncpy(msrp->path, path, PATH_BYTES - 1); - msrp->width = width; - msrp->type = type; - msrp->format = format; - msrp->flags = flags; + if (debug) + printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, + path, name, width, scope, type, format, flags, id); switch (scope) { case SCOPE_CPU: - msrp->next = sys.tp; - sys.tp = msrp; - sys.added_thread_counters++; - if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.tp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter %s", name); + return -1; } break; - case SCOPE_CORE: - msrp->next = sys.cp; - sys.cp = msrp; - sys.added_core_counters++; - if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.cp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter %s", name); + return -1; } break; - case SCOPE_PACKAGE: - msrp->next = sys.pp; - sys.pp = msrp; - sys.added_package_counters++; - if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); - exit(-1); + msrp = find_msrp_by_name(sys.pp, name); + if (msrp) { + if (debug) + printf("%s: %s FOUND\n", __func__, name); + break; + } + if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter %s", name); + return -1; } break; + default: + warnx("ignoring counter %s with unknown scope", name); + return -1; + } + + if (msrp == NULL) { + msrp = calloc(1, sizeof(struct msr_counter)); + if (msrp == NULL) + err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; + strncpy(msrp->name, name, NAME_BYTES - 1); + msrp->width = width; + msrp->type = type; + msrp->format = format; + msrp->flags = flags; + + switch (scope) { + case SCOPE_CPU: + msrp->next = sys.tp; + sys.tp = msrp; + break; + case SCOPE_CORE: + msrp->next = sys.cp; + sys.cp = msrp; + break; + case SCOPE_PACKAGE: + msrp->next = sys.pp; + sys.pp = msrp; + break; + } + } + + if (path) { + struct sysfs_path *sp; + + sp = calloc(1, sizeof(struct sysfs_path)); + if (sp == NULL) { + perror("calloc"); + exit(1); + } + strncpy(sp->path, path, PATH_BYTES - 1); + sp->id = id; + sp->next = msrp->sp; + msrp->sp = sp; } return 0; @@ -7679,7 +8252,7 @@ next: sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0)) + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) fail++; if (fail) { @@ -7744,7 +8317,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); } for (state = 10; state >= 0; --state) { @@ -7772,7 +8345,7 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0); } } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index e9ea38aa8248..09a8e6f9b379 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -653,7 +653,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, cmpstr = "(struct file_operations){\n" " .owner = (struct module *)0xffffffffffffffff,\n" -" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; +" .fop_flags = (fop_flags_t)4294967295,"; ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); } diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index da30f0d59364..ab0ce1d01a4a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -110,10 +110,14 @@ DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_map, array_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_map, array_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, hash_malloc_map, array_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, lru_hash_map, array_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_array_map, array_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_ARRAY_OF_MAPS, pcpu_hash_map, array_of_pcpu_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, array_map, hash_of_array_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_map, hash_of_hash_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, hash_malloc_map, hash_of_hash_malloc_maps); DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, lru_hash_map, hash_of_lru_hash_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_array_map, hash_of_pcpu_array_maps); +DEFINE_MAP_OF_MAP(BPF_MAP_TYPE_HASH_OF_MAPS, pcpu_hash_map, hash_of_pcpu_hash_maps); #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) @@ -204,6 +208,8 @@ int test_map_kptr(struct __sk_buff *ctx) TEST(hash_map); TEST(hash_malloc_map); TEST(lru_hash_map); + TEST(pcpu_array_map); + TEST(pcpu_hash_map); #undef TEST return 0; @@ -281,10 +287,14 @@ int test_map_in_map_kptr(struct __sk_buff *ctx) TEST(array_of_hash_maps); TEST(array_of_hash_malloc_maps); TEST(array_of_lru_hash_maps); + TEST(array_of_pcpu_array_maps); + TEST(array_of_pcpu_hash_maps); TEST(hash_of_array_maps); TEST(hash_of_hash_maps); TEST(hash_of_hash_malloc_maps); TEST(hash_of_lru_hash_maps); + TEST(hash_of_pcpu_array_maps); + TEST(hash_of_pcpu_hash_maps); #undef TEST return 0; diff --git a/tools/testing/selftests/bpf/progs/test_access_variable_array.c b/tools/testing/selftests/bpf/progs/test_access_variable_array.c index 808c49b79889..326b7d1f496a 100644 --- a/tools/testing/selftests/bpf/progs/test_access_variable_array.c +++ b/tools/testing/selftests/bpf/progs/test_access_variable_array.c @@ -7,7 +7,7 @@ unsigned long span = 0; -SEC("fentry/load_balance") +SEC("fentry/sched_balance_rq") int BPF_PROG(fentry_fentry, int this_cpu, struct rq *this_rq, struct sched_domain *sd) { diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index fa571bfd2432..190096017f80 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,7 +50,7 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_cg_wb_count(const char *cg) +static long get_cg_wb_count(const char *cg) { return cg_read_key_long(cg, "memory.stat", "zswpwb"); } @@ -249,6 +249,132 @@ out: } /* + * Attempt writeback with the following steps: + * 1. Allocate memory. + * 2. Reclaim memory equal to the amount that was allocated in step 1. + This will move it into zswap. + * 3. Save current zswap usage. + * 4. Move the memory allocated in step 1 back in from zswap. + * 5. Set zswap.max to half the amount that was recorded in step 3. + * 6. Attempt to reclaim memory equal to the amount that was allocated, + this will either trigger writeback if it's enabled, or reclamation + will fail if writeback is disabled as there isn't enough zswap space. + */ +static int attempt_writeback(const char *cgroup, void *arg) +{ + long pagesize = sysconf(_SC_PAGESIZE); + char *test_group = arg; + size_t memsize = MB(4); + char buf[pagesize]; + long zswap_usage; + bool wb_enabled; + int ret = -1; + char *mem; + + wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); + mem = (char *)malloc(memsize); + if (!mem) + return ret; + + /* + * Fill half of each page with increasing data, and keep other + * half empty, this will result in data that is still compressible + * and ends up in zswap, with material zswap usage. + */ + for (int i = 0; i < pagesize; i++) + buf[i] = i < pagesize/2 ? (char) i : 0; + + for (int i = 0; i < memsize; i += pagesize) + memcpy(&mem[i], buf, pagesize); + + /* Try and reclaim allocated memory */ + if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + ksft_print_msg("Failed to reclaim all of the requested memory\n"); + goto out; + } + + zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + + /* zswpin */ + for (int i = 0; i < memsize; i += pagesize) { + if (memcmp(&mem[i], buf, pagesize)) { + ksft_print_msg("invalid memory\n"); + goto out; + } + } + + if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + goto out; + + /* + * If writeback is enabled, trying to reclaim memory now will trigger a + * writeback as zswap.max is half of what was needed when reclaim ran the first time. + * If writeback is disabled, memory reclaim will fail as zswap is limited and + * it can't writeback to swap. + */ + ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + if (!wb_enabled) + ret = (ret == -EAGAIN) ? 0 : -1; + +out: + free(mem); + return ret; +} + +/* Test to verify the zswap writeback path */ +static int test_zswap_writeback(const char *root, bool wb) +{ + long zswpwb_before, zswpwb_after; + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "zswap_writeback_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) + goto out; + + zswpwb_before = get_cg_wb_count(test_group); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + goto out; + } + + if (cg_run(test_group, attempt_writeback, (void *) test_group)) + goto out; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(test_group); + if (zswpwb_after < 0) + goto out; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +static int test_zswap_writeback_enabled(const char *root) +{ + return test_zswap_writeback(root, true); +} + +static int test_zswap_writeback_disabled(const char *root) +{ + return test_zswap_writeback(root, false); +} + +/* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that * memcg. @@ -364,8 +490,6 @@ static int test_no_kmem_bypass(const char *root) trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ - if (cg_write(root, "cgroup.subtree_control", "+memory")) - goto out; test_group = cg_name(root, "kmem_bypass_test"); if (!test_group) goto out; @@ -425,6 +549,8 @@ struct zswap_test { T(test_zswap_usage), T(test_swapin_nozswap), T(test_zswapin), + T(test_zswap_writeback_enabled), + T(test_zswap_writeback_disabled), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 789d6949c247..29a22f50e762 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,16 +7,21 @@ TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh + +# functionality tests TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh +TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py +TEST_PROGS += reclaim.sh lru_sort.sh + +# regression tests (reproducers of previously found bugs) TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += debugfs_target_ids_pid_leak.sh -TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py -TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py damos_apply_interval.py -TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index d23d7398a27a..2bd44c32be1b 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -2,7 +2,18 @@ import os -sysfs_root = '/sys/kernel/mm/damon/admin' +ksft_skip=4 + +sysfs_root = None +with open('/proc/mounts', 'r') as f: + for line in f: + dev_name, mount_point, dev_fs = line.split()[:3] + if dev_fs == 'sysfs': + sysfs_root = '%s/kernel/mm/damon/admin' % mount_point + break +if sysfs_root is None: + print('Seems sysfs not mounted?') + exit(ksft_skip) def write_file(path, string): "Returns error string if failed, or None otherwise" @@ -34,11 +45,11 @@ class DamosAccessPattern: self.nr_accesses = nr_accesses self.age = age - if self.size == None: + if self.size is None: self.size = [0, 2**64 - 1] - if self.nr_accesses == None: + if self.nr_accesses is None: self.nr_accesses = [0, 2**64 - 1] - if self.age == None: + if self.age is None: self.age = [0, 2**64 - 1] def sysfs_dir(self): @@ -47,55 +58,109 @@ class DamosAccessPattern: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), self.nr_accesses[0]) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), self.nr_accesses[1]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.sysfs_dir(), 'age', 'max'), self.age[1]) - if err != None: + if err is not None: return err +qgoal_metric_user_input = 'user_input' +qgoal_metric_some_mem_psi_us = 'some_mem_psi_us' +qgoal_metrics = [qgoal_metric_user_input, qgoal_metric_some_mem_psi_us] + +class DamosQuotaGoal: + metric = None + target_value = None + current_value = None + effective_bytes = None + quota = None # owner quota + idx = None + + def __init__(self, metric, target_value=10000, current_value=0): + self.metric = metric + self.target_value = target_value + self.current_value = current_value + + def sysfs_dir(self): + return os.path.join(self.quota.sysfs_dir(), 'goals', '%d' % self.idx) + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'target_metric'), + self.metric) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'target_value'), + self.target_value) + if err is not None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'current_value'), + self.current_value) + if err is not None: + return err + return None + class DamosQuota: sz = None # size quota, in bytes ms = None # time quota + goals = None # quota goals reset_interval_ms = None # quota reset interval scheme = None # owner scheme - def __init__(self, sz=0, ms=0, reset_interval_ms=0): + def __init__(self, sz=0, ms=0, goals=None, reset_interval_ms=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms + self.goals = goals if goals is not None else [] + for idx, goal in enumerate(self.goals): + goal.idx = idx + goal.quota = self def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), self.reset_interval_ms) - if err != None: + if err is not None: return err + nr_goals_file = os.path.join(self.sysfs_dir(), 'goals', 'nr_goals') + content, err = read_file(nr_goals_file) + if err is not None: + return err + if int(content) != len(self.goals): + err = write_file(nr_goals_file, len(self.goals)) + if err is not None: + return err + for goal in self.goals: + err = goal.stage() + if err is not None: + return err + return None + class DamosStats: nr_tried = None sz_tried = None @@ -136,30 +201,30 @@ class Damos: def stage(self): err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) - if err != None: + if err is not None: return err err = self.access_pattern.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), '%d' % self.apply_interval_us) - if err != None: + if err is not None: return err err = self.quota.stage() - if err != None: + if err is not None: return err # disable watermarks err = write_file( os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') - if err != None: + if err is not None: return err # disable filters err = write_file( os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') - if err != None: + if err is not None: return err class DamonTarget: @@ -178,7 +243,7 @@ class DamonTarget: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') - if err != None: + if err is not None: return err return write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) @@ -210,27 +275,27 @@ class DamonAttrs: def stage(self): err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), self.sample_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), self.aggr_us) - if err != None: + if err is not None: return err err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), self.update_us) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), self.min_nr_regions) - if err != None: + if err is not None: return err err = write_file( os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), self.max_nr_regions) - if err != None: + if err is not None: return err class DamonCtx: @@ -264,36 +329,38 @@ class DamonCtx: def stage(self): err = write_file( os.path.join(self.sysfs_dir(), 'operations'), self.ops) - if err != None: + if err is not None: return err err = self.monitoring_attrs.stage() - if err != None: + if err is not None: return err nr_targets_file = os.path.join( self.sysfs_dir(), 'targets', 'nr_targets') content, err = read_file(nr_targets_file) - if err != None: + if err is not None: return err if int(content) != len(self.targets): err = write_file(nr_targets_file, '%d' % len(self.targets)) - if err != None: + if err is not None: return err for target in self.targets: err = target.stage() - if err != None: + if err is not None: return err nr_schemes_file = os.path.join( self.sysfs_dir(), 'schemes', 'nr_schemes') content, err = read_file(nr_schemes_file) + if err is not None: + return err if int(content) != len(self.schemes): err = write_file(nr_schemes_file, '%d' % len(self.schemes)) - if err != None: + if err is not None: return err for scheme in self.schemes: err = scheme.stage() - if err != None: + if err is not None: return err return None @@ -317,16 +384,16 @@ class Kdamond: nr_contexts_file = os.path.join(self.sysfs_dir(), 'contexts', 'nr_contexts') content, err = read_file(nr_contexts_file) - if err != None: + if err is not None: return err if int(content) != len(self.contexts): err = write_file(nr_contexts_file, '%d' % len(self.contexts)) - if err != None: + if err is not None: return err for context in self.contexts: err = context.stage() - if err != None: + if err is not None: return err err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err @@ -334,20 +401,20 @@ class Kdamond: def update_schemes_tried_bytes(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_tried_bytes') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: content, err = read_file(os.path.join(scheme.sysfs_dir(), 'tried_regions', 'total_bytes')) - if err != None: + if err is not None: return err scheme.tried_bytes = int(content) def update_schemes_stats(self): err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'update_schemes_stats') - if err != None: + if err is not None: return err for context in self.contexts: for scheme in context.schemes: @@ -356,11 +423,39 @@ class Kdamond: 'sz_applied', 'qt_exceeds']: content, err = read_file( os.path.join(scheme.sysfs_dir(), 'stats', stat)) - if err != None: + if err is not None: return err stat_values.append(int(content)) scheme.stats = DamosStats(*stat_values) + def update_schemes_effective_quotas(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_effective_quotas') + if err is not None: + return err + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + content, err = read_file( + os.path.join(scheme.quota.sysfs_dir(), + 'effective_bytes')) + if err is not None: + return err + goal.effective_bytes = int(content) + return None + + def commit_schemes_quota_goals(self): + for context in self.contexts: + for scheme in context.schemes: + for goal in scheme.quota.goals: + err = goal.stage() + if err is not None: + print('commit_schemes_quota_goals failed stagign: %s'% + err) + exit(1) + return write_file(os.path.join(self.sysfs_dir(), 'state'), + 'commit_schemes_quota_goals') + class Kdamonds: kdamonds = [] @@ -376,10 +471,10 @@ class Kdamonds: def start(self): err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), '%s' % len(self.kdamonds)) - if err != None: + if err is not None: return err for kdamond in self.kdamonds: err = kdamond.start() - if err != None: + if err is not None: return err return None diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py new file mode 100644 index 000000000000..18246f3b62f7 --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota_goal.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + goal = _damon_sysfs.DamosQuotaGoal( + metric=_damon_sysfs.qgoal_metric_user_input, target_value=10000) + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + action='stat', + quota=_damon_sysfs.DamosQuota( + goals=[goal], reset_interval_ms=100), + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + score_values_to_test = [0, 15000, 5000, 18000] + while proc.poll() == None: + if len(score_values_to_test) == 0: + time.sleep(0.1) + continue + + goal.current_value = score_values_to_test.pop(0) + expect_increase = goal.current_value < goal.target_value + + err = kdamonds.kdamonds[0].commit_schemes_quota_goals() + if err is not None: + print('commit_schemes_quota_goals failed: %s' % err) + exit(1) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('before-update_schemes_effective_quotas failed: %s' % err) + exit(1) + last_effective_bytes = goal.effective_bytes + + time.sleep(0.5) + + err = kdamonds.kdamonds[0].update_schemes_effective_quotas() + if err is not None: + print('after-update_schemes_effective_quotas failed: %s' % err) + exit(1) + + print('score: %s, effective quota: %d -> %d (%.3fx)' % ( + goal.current_value, last_effective_bytes, goal.effective_bytes, + goal.effective_bytes / last_effective_bytes + if last_effective_bytes != 0 else -1.0)) + + if last_effective_bytes == goal.effective_bytes: + print('efective bytes not changed: %d' % goal.effective_bytes) + exit(1) + + increased = last_effective_bytes < goal.effective_bytes + if expect_increase != increased: + print('expectation of increase (%s) != increased (%s)' % + (expect_increase, increased)) + exit(1) + last_effective_bytes = goal.effective_bytes + +if __name__ == '__main__': + main() diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 000000000000..c6a9d2466a71 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,41 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: dynamic_events "%pd/%pD":README "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README + + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 000000000000..21a54be6894c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index 25110c7c0b3e..d7a8e321bb16 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(3); - fd2 = open(kpath, O_RDWR, 0644); + fd2 = open(kpath, O_RDWR); if (fd2 < 0) { perror("Can't open file"); ksft_exit_fail(); diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index a6f89aaea77d..3c1e9f35b531 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(4, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(5, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 27744524df51..6b5a9ff88c3d 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -8,22 +8,34 @@ */ #define _GNU_SOURCE +#include <asm/termbits.h> #include <fcntl.h> #include <libgen.h> +#include <linux/fiemap.h> #include <linux/landlock.h> #include <linux/magic.h> #include <sched.h> +#include <stddef.h> #include <stdio.h> #include <string.h> #include <sys/capability.h> +#include <sys/ioctl.h> #include <sys/mount.h> #include <sys/prctl.h> #include <sys/sendfile.h> +#include <sys/socket.h> #include <sys/stat.h> #include <sys/sysmacros.h> +#include <sys/un.h> #include <sys/vfs.h> #include <unistd.h> +/* + * Intentionally included last to work around header conflict. + * See https://sourceware.org/glibc/wiki/Synchronizing_Headers. + */ +#include <linux/fs.h> + #include "common.h" #ifndef renameat2 @@ -536,9 +548,10 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE | \ - LANDLOCK_ACCESS_FS_TRUNCATE) + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE +#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL_DEV #define ACCESS_ALL ( \ ACCESS_FILE | \ @@ -743,6 +756,9 @@ static int create_ruleset(struct __test_metadata *const _metadata, } for (i = 0; rules[i].path; i++) { + if (!rules[i].access) + continue; + add_path_beneath(_metadata, ruleset_fd, rules[i].access, rules[i].path); } @@ -3451,7 +3467,7 @@ TEST_F_FORK(layout1, truncate_unhandled) LANDLOCK_ACCESS_FS_WRITE_FILE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3534,7 +3550,7 @@ TEST_F_FORK(layout1, truncate) LANDLOCK_ACCESS_FS_TRUNCATE; int ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, handled, rules); ASSERT_LE(0, ruleset_fd); @@ -3760,7 +3776,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate) }; int fd, ruleset_fd; - /* Enable Landlock. */ + /* Enables Landlock. */ ruleset_fd = create_ruleset(_metadata, variant->handled, rules); ASSERT_LE(0, ruleset_fd); enforce_ruleset(_metadata, ruleset_fd); @@ -3837,22 +3853,471 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes) ASSERT_EQ(0, close(socket_fds[1])); } -TEST(memfd_ftruncate) +/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */ +static int test_fs_ioc_getflags_ioctl(int fd) { - int fd; + uint32_t flags; + + if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) + return errno; + return 0; +} + +TEST(memfd_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd, i; + + /* + * We exercise the same test both with and without Landlock enabled, to + * ensure that it behaves the same in both cases. + */ + for (i = 0; i < 2; i++) { + /* Creates a new memfd. */ + fd = memfd_create("name", MFD_CLOEXEC); + ASSERT_LE(0, fd); + + /* + * Checks that operations associated with the opened file + * (ftruncate, ioctl) are permitted on file descriptors that are + * created in ways other than open(2). + */ + EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + } +} + +static int test_fionread_ioctl(int fd) +{ + size_t sz = 0; + + if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES) + return errno; + return 0; +} + +TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = ACCESS_ALL, + }; + int ruleset_fd, fd; + + /* + * Checks that for files opened with O_PATH, both ioctl(2) and + * ftruncate(2) yield EBADF, as it is documented in open(2) for the + * O_PATH flag. + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that after enabling Landlock, + * - the file can still be opened with O_PATH + * - both ioctl and truncate still yield EBADF (not EACCES). + */ + fd = open(dir_s1d1, O_PATH | O_CLOEXEC); + ASSERT_LE(0, fd); + + EXPECT_EQ(EBADF, test_ftruncate(fd)); + EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); +} + +/* + * ioctl_error - generically call the given ioctl with a pointer to a + * sufficiently large zeroed-out memory region. + * + * Returns the IOCTLs error, or 0. + */ +static int ioctl_error(struct __test_metadata *const _metadata, int fd, + unsigned int cmd) +{ + char buf[128]; /* sufficiently large */ + int res, stdinbak_fd; + + /* + * Depending on the IOCTL command, parts of the zeroed-out buffer might + * be interpreted as file descriptor numbers. We do not want to + * accidentally operate on file descriptor 0 (stdin), so we temporarily + * move stdin to a different FD and close FD 0 for the IOCTL call. + */ + stdinbak_fd = dup(0); + ASSERT_LT(0, stdinbak_fd); + ASSERT_EQ(0, close(0)); + + /* Invokes the IOCTL with a zeroed-out buffer. */ + bzero(&buf, sizeof(buf)); + res = ioctl(fd, cmd, &buf); + + /* Restores the old FD 0 and closes the backup FD. */ + ASSERT_EQ(0, dup2(stdinbak_fd, 0)); + ASSERT_EQ(0, close(stdinbak_fd)); + + if (res < 0) + return errno; + + return 0; +} + +/* Define some linux/falloc.h IOCTL commands which are not available in uapi headers. */ +struct space_resv { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserved area */ +}; + +#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) +#define FS_IOC_UNRESVSP _IOW('X', 41, struct space_resv) +#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) +#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) + +/* + * Tests a series of blanket-permitted and denied IOCTLs. + */ +TEST_F_FORK(layout1, blanket_permitted_ioctls) +{ + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + int ruleset_fd, fd; - fd = memfd_create("name", MFD_CLOEXEC); + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + fd = open("/dev/null", O_RDWR | O_CLOEXEC); ASSERT_LE(0, fd); /* - * Checks that ftruncate is permitted on file descriptors that are - * created in ways other than open(2). + * Checks permitted commands. + * These ones may return errors, but should not be blocked by Landlock. + */ + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONCLEX)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIONBIO)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOASYNC)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIOQSIZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIFREEZE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FITHAW)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_FIEMAP)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIGETBSZ)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FICLONERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FIDEDUPERANGE)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSUUID)); + EXPECT_NE(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFSSYSFSPATH)); + + /* + * Checks blocked commands. + * A call to a blocked IOCTL command always returns EACCES. */ - EXPECT_EQ(0, test_ftruncate(fd)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIONREAD)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_GETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_SETFLAGS)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSGETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_FSSETXATTR)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FIBMAP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_RESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_UNRESVSP64)); + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, FS_IOC_ZERO_RANGE)); + + /* Default case is also blocked. */ + EXPECT_EQ(EACCES, ioctl_error(_metadata, fd, 0xc00ffeee)); ASSERT_EQ(0, close(fd)); } +/* + * Named pipes are not governed by the LANDLOCK_ACCESS_FS_IOCTL_DEV right, + * because they are not character or block devices. + */ +TEST_F_FORK(layout1, named_pipe_ioctl) +{ + pid_t child_pid; + int fd, ruleset_fd; + const char *const path = file1_s1d1; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + ASSERT_EQ(0, unlink(path)); + ASSERT_EQ(0, mkfifo(path, 0600)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* The child process opens the pipe for writing. */ + child_pid = fork(); + ASSERT_NE(-1, child_pid); + if (child_pid == 0) { + fd = open(path, O_WRONLY); + close(fd); + exit(0); + } + + fd = open(path, O_RDONLY); + ASSERT_LE(0, fd); + + /* FIONREAD is implemented by pipefifo_fops. */ + EXPECT_EQ(0, test_fionread_ioctl(fd)); + + ASSERT_EQ(0, close(fd)); + ASSERT_EQ(0, unlink(path)); + + ASSERT_EQ(child_pid, waitpid(child_pid, NULL, 0)); +} + +/* For named UNIX domain sockets, no IOCTL restrictions apply. */ +TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) +{ + const char *const path = file1_s1d1; + int srv_fd, cli_fd, ruleset_fd; + socklen_t size; + struct sockaddr_un srv_un, cli_un; + const struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, + }; + + /* Sets up a server */ + srv_un.sun_family = AF_UNIX; + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + + ASSERT_EQ(0, unlink(path)); + srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, srv_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); + + /* Enables Landlock. */ + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Sets up a client connection to it */ + cli_un.sun_family = AF_UNIX; + cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, cli_fd); + + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); + + bzero(&cli_un, sizeof(cli_un)); + cli_un.sun_family = AF_UNIX; + strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); + size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); + + ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + + /* FIONREAD and other IOCTLs should not be forbidden. */ + EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); + + ASSERT_EQ(0, close(cli_fd)); +} + +/* clang-format off */ +FIXTURE(ioctl) {}; + +FIXTURE_SETUP(ioctl) {}; + +FIXTURE_TEARDOWN(ioctl) {}; +/* clang-format on */ + +FIXTURE_VARIANT(ioctl) +{ + const __u64 handled; + const __u64 allowed; + const mode_t open_mode; + /* + * FIONREAD is used as a characteristic device-specific IOCTL command. + * It is implemented in fs/ioctl.c for regular files, + * but we do not blanket-permit it for devices. + */ + const int expected_fionread_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = 0, + .open_mode = O_RDWR, + .expected_fionread_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .allowed = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(ioctl, unhandled) { + /* clang-format on */ + .handled = LANDLOCK_ACCESS_FS_EXECUTE, + .allowed = LANDLOCK_ACCESS_FS_EXECUTE, + .open_mode = O_RDWR, + .expected_fionread_result = 0, +}; + +TEST_F_FORK(ioctl, handle_dir_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd); + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + +TEST_F_FORK(ioctl, handle_dir_access_dir) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev", + .access = variant->allowed, + }, + {}, + }; + int dir_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Ignore variant->open_mode for this test, as we intend to open a + * directory. If the directory can not be opened, the variant is + * infeasible to test with an opened directory. + */ + dir_fd = open("/dev", O_RDONLY); + if (dir_fd < 0) + return; + + /* + * Checks that IOCTL commands return the expected errors. + * We do not use the expected values from the fixture here. + * + * When using IOCTL on a directory, no Landlock restrictions apply. + */ + EXPECT_EQ(0, test_fionread_ioctl(dir_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(dir_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(dir_fd)); +} + +TEST_F_FORK(ioctl, handle_file_access_file) +{ + const int flag = 0; + const struct rule rules[] = { + { + .path = "/dev/zero", + .access = variant->allowed, + }, + {}, + }; + int file_fd, ruleset_fd; + + /* Enables Landlock. */ + ruleset_fd = create_ruleset(_metadata, variant->handled, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + file_fd = open("/dev/zero", variant->open_mode); + ASSERT_LE(0, file_fd) + { + TH_LOG("Failed to open /dev/zero: %s", strerror(errno)); + } + + /* Checks that IOCTL commands return the expected errors. */ + EXPECT_EQ(variant->expected_fionread_result, + test_fionread_ioctl(file_fd)); + + /* Checks that unrestrictable commands are unrestricted. */ + EXPECT_EQ(0, ioctl(file_fd, FIOCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONCLEX)); + EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag)); + EXPECT_EQ(0, ioctl(file_fd, FIGETBSZ, &flag)); + + ASSERT_EQ(0, close(file_fd)); +} + /* clang-format off */ FIXTURE(layout1_bind) {}; /* clang-format on */ diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 09e8a854f0fc..429535816dbd 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -70,6 +70,15 @@ ifeq ($(KHDR_INCLUDES),) KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include endif +# In order to use newer items that haven't yet been added to the user's system +# header files, add $(TOOLS_INCLUDES) to the compiler invocation in each +# each selftest. +# You may need to add files to that location, or to refresh an existing file. In +# order to do that, run "make headers" from $(top_srcdir), then copy the +# header file that you want from $(top_srcdir)/usr/include/... , to the matching +# subdir in $(TOOLS_INCLUDE). +TOOLS_INCLUDES := -isystem $(top_srcdir)/tools/include/uapi + # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require # custom build rule and prevent common build rule use. diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 93798c8c5d54..dbc171a3806d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -306,7 +306,7 @@ int main(int argc, char **argv) * then the kernel did a page-replacement or canceled the read() (or * whatever magic it did..). In that case, the memfd object is still * all zero. - * In case the memfd-object was *not* sealed, the read() was successfull + * In case the memfd-object was *not* sealed, the read() was successful * and the memfd object must *not* be all zero. * Note that in real scenarios, there might be a mixture of both, but * in this test-cases, we have explicit 200ms delays which should be diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 18f585684e20..95af2d78fd31 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1528,7 +1528,7 @@ static void test_share_open(char *banner, char *b_suffix) /* * Test sharing via fork() - * Test whether seal-modifications work as expected with forked childs. + * Test whether seal-modifications work as expected with forked children. */ static void test_share_fork(char *banner, char *b_suffix) { diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 410495e0a611..60b8feb6a5ec 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -32,7 +32,7 @@ endif # LDLIBS. MAKEFLAGS += --no-builtin-rules -CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = cow diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index fe078d6e1806..32c6ccc2a6be 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -199,7 +199,7 @@ static int child_vmsplice_memcmp_fn(char *mem, size_t size, typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) + child_fn fn, bool xfail) { struct comm_pipes comm_pipes; char buf; @@ -247,33 +247,47 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, else ret = -EINVAL; - ksft_test_result(!ret, "No leak from parent into child\n"); + if (!ret) { + ksft_test_result_pass("No leak from parent into child\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from parent into child\n"); + } else { + ksft_test_result_fail("Leak from parent into child\n"); + } close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_cow_in_parent(char *mem, size_t size) +static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); } -static void test_cow_in_parent_mprotect(char *mem, size_t size) +static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); } -static void test_vmsplice_in_child(char *mem, size_t size) +static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) { - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, + is_hugetlb); } -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +static void test_vmsplice_in_child_mprotect(char *mem, size_t size, + bool is_hugetlb) { - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, + is_hugetlb); } static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) + bool before_fork, bool xfail) { struct iovec iov = { .iov_base = mem, @@ -355,8 +369,18 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, } } - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); + if (!memcmp(old, new, transferred)) { + ksft_test_result_pass("No leak from child into parent\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from child into parent\n"); + } else { + ksft_test_result_fail("Leak from child into parent\n"); + } close_pipe: close(fds[0]); close(fds[1]); @@ -367,14 +391,14 @@ free: free(new); } -static void test_vmsplice_before_fork(char *mem, size_t size) +static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, true); + do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); } -static void test_vmsplice_after_fork(char *mem, size_t size) +static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) { - do_test_vmsplice_in_parent(mem, size, false); + do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); } #ifdef LOCAL_CONFIG_HAVE_LIBURING @@ -529,12 +553,12 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_iouring_ro(char *mem, size_t size) +static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, false); } -static void test_iouring_fork(char *mem, size_t size) +static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) { do_test_iouring(mem, size, true); } @@ -678,37 +702,41 @@ free_tmp: free(tmp); } -static void test_ro_pin_on_shared(char *mem, size_t size) +static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } -static void test_ro_fast_pin_on_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } -typedef void (*test_fn)(char *mem, size_t size); +typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); static void do_run_with_base_page(test_fn fn, bool swapout) { @@ -740,7 +768,7 @@ static void do_run_with_base_page(test_fn fn, bool swapout) } } - fn(mem, pagesize); + fn(mem, pagesize, false); munmap: munmap(mem, pagesize); } @@ -904,7 +932,7 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) break; } - fn(mem, size); + fn(mem, size, false); munmap: munmap(mmap_mem, mmap_size); if (mremap_mem != MAP_FAILED) @@ -997,7 +1025,7 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) } munmap(dummy, hugetlbsize); - fn(mem, hugetlbsize); + fn(mem, hugetlbsize, true); munmap: munmap(mem, hugetlbsize); } @@ -1036,7 +1064,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child", - test_vmsplice_in_child + test_vmsplice_in_child, }, /* * vmsplice() test, but do an additional mprotect(PROT_READ)+ @@ -1044,7 +1072,7 @@ static const struct test_case anon_test_cases[] = { */ { "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect + test_vmsplice_in_child_mprotect, }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after @@ -1322,23 +1350,31 @@ close_comm_pipes: close_comm_pipes(&comm_pipes); } -static void test_anon_thp_collapse_unshared(char *mem, size_t size) +static void test_anon_thp_collapse_unshared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); } -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); } -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); } -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, + bool is_hugetlb) { + assert(!is_hugetlb); do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); } diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index d7eaca5bbe9b..9423ad439a61 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -118,15 +118,22 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) return; } - /* - * Fault in the page writable such that GUP-fast can eventually pin - * it immediately. - */ + /* Fault in the page such that GUP-fast can pin it directly. */ memset(mem, 0, size); switch (type) { case TEST_TYPE_RO: case TEST_TYPE_RO_FAST: + /* + * Cover more cases regarding unsharing decisions when + * long-term R/O pinning by mapping the page R/O. + */ + ret = mprotect(mem, size, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + /* FALLTHROUGH */ case TEST_TYPE_RW: case TEST_TYPE_RW_FAST: { struct pin_longterm_test args; @@ -228,6 +235,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) assert(false); } +munmap: munmap(mem, size); } diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index d01e8d4901d0..8f122a0f0828 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -27,9 +27,9 @@ #include "vm_util.h" #include "../kselftest.h" -#define MMAP_SIZE (1 << 21) #define INLOOP_ITER 100 +size_t mmap_size; char *huge_ptr; /* Touch the memory while it is being madvised() */ @@ -44,7 +44,7 @@ void *touch(void *unused) void *madv(void *unused) { for (int i = 0; i < INLOOP_ITER; i++) - madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + madvise(huge_ptr, mmap_size, MADV_DONTNEED); return NULL; } @@ -59,7 +59,7 @@ void *map_extra(void *unused) void *ptr; for (int i = 0; i < INLOOP_ITER; i++) { - ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); @@ -93,14 +93,16 @@ int main(void) free_hugepages); } + mmap_size = default_huge_page_size(); + while (max--) { - huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if ((unsigned long)huge_ptr == -1) { - ksft_exit_skip("Failed to allocated huge page\n"); - return KSFT_SKIP; + ksft_test_result_fail("Failed to allocate huge page\n"); + return KSFT_FAIL; } pthread_create(&thread1, NULL, madv, NULL); @@ -117,7 +119,7 @@ int main(void) } /* Unmap and restart */ - munmap(huge_ptr, MMAP_SIZE); + munmap(huge_ptr, mmap_size); } return KSFT_PASS; diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 508287560c45..37de82da9be7 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -28,6 +28,15 @@ #define MiB (1024 * KiB) #define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" +#define MAP_MERGE_FAIL ((void *)-1) +#define MAP_MERGE_SKIP ((void *)-2) + +enum ksm_merge_mode { + KSM_MERGE_PRCTL, + KSM_MERGE_MADVISE, + KSM_MERGE_NONE, /* PRCTL already set */ +}; + static int mem_fd; static int ksm_fd; static int ksm_full_scans_fd; @@ -146,33 +155,34 @@ static int ksm_unmerge(void) return 0; } -static char *mmap_and_merge_range(char val, unsigned long size, int prot, - bool use_prctl) +static char *__mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) { char *map; + char *err_map = MAP_MERGE_FAIL; int ret; /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { - ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - return MAP_FAILED; + ksft_print_msg("Disabling (unmerging) KSM failed\n"); + return err_map; } if (get_my_merging_pages() > 0) { - ksft_test_result_fail("Still pages merged\n"); - return MAP_FAILED; + ksft_print_msg("Still pages merged\n"); + return err_map; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; + ksft_print_msg("mmap() failed\n"); + return err_map; } /* Don't use THP. Ignore if THP are not around on a kernel. */ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + ksft_print_msg("MADV_NOHUGEPAGE failed\n"); goto unmap; } @@ -180,27 +190,36 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, memset(map, val, size); if (mprotect(map, size, prot)) { - ksft_test_result_skip("mprotect() failed\n"); + ksft_print_msg("mprotect() failed\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } - if (use_prctl) { + switch (mode) { + case KSM_MERGE_PRCTL: ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { - ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n"); + err_map = MAP_MERGE_SKIP; goto unmap; } else if (ret) { - ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n"); goto unmap; } - } else if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; + break; + case KSM_MERGE_MADVISE: + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_print_msg("MADV_MERGEABLE failed\n"); + goto unmap; + } + break; + case KSM_MERGE_NONE: + break; } /* Run KSM to trigger merging and wait. */ if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); + ksft_print_msg("Running KSM failed\n"); goto unmap; } @@ -209,14 +228,31 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, * accounted differently (depending on kernel support). */ if (val && !get_my_merging_pages()) { - ksft_test_result_fail("No pages got merged\n"); + ksft_print_msg("No pages got merged\n"); goto unmap; } return map; unmap: munmap(map, size); - return MAP_FAILED; + return err_map; +} + +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *ret = MAP_FAILED; + + map = __mmap_and_merge_range(val, size, prot, mode); + if (map == MAP_MERGE_FAIL) + ksft_test_result_fail("Merging memory failed"); + else if (map == MAP_MERGE_SKIP) + ksft_test_result_skip("Merging memory skipped"); + else + ret = map; + + return ret; } static void test_unmerge(void) @@ -226,7 +262,7 @@ static void test_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -264,7 +300,7 @@ static void test_unmerge_zero_pages(void) } /* Let KSM deduplicate zero pages. */ - map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -312,7 +348,7 @@ static void test_unmerge_discarded(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -344,7 +380,7 @@ static void test_unmerge_uffd_wp(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) return; @@ -439,6 +475,36 @@ static void test_prctl(void) ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); } +static int test_child_ksm(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + /* Test if KSM is enabled for the process. */ + if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) + return -1; + + /* Test if merge could really happen. */ + map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); + if (map == MAP_MERGE_FAIL) + return -2; + else if (map == MAP_MERGE_SKIP) + return -3; + + munmap(map, size); + return 0; +} + +static void test_child_ksm_err(int status) +{ + if (status == -1) + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + else if (status == -2) + ksft_test_result_fail("Merge in child failed\n"); + else if (status == -3) + ksft_test_result_skip("Merge in child skipped\n"); +} + /* Verify that prctl ksm flag is inherited. */ static void test_prctl_fork(void) { @@ -458,7 +524,7 @@ static void test_prctl_fork(void) child_pid = fork(); if (!child_pid) { - exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0)); + exit(test_child_ksm()); } else if (child_pid < 0) { ksft_test_result_fail("fork() failed\n"); return; @@ -467,8 +533,11 @@ static void test_prctl_fork(void) if (waitpid(child_pid, &status, 0) < 0) { ksft_test_result_fail("waitpid() failed\n"); return; - } else if (WEXITSTATUS(status) != 1) { - ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + } + + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); return; } @@ -480,12 +549,6 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } -static int ksm_fork_exec_child(void) -{ - /* Test if KSM is enabled for the process. */ - return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1; -} - static void test_prctl_fork_exec(void) { int ret, status; @@ -518,7 +581,7 @@ static void test_prctl_fork_exec(void) if (WIFEXITED(status)) { status = WEXITSTATUS(status); if (status) { - ksft_test_result_fail("KSM not enabled\n"); + test_child_ksm_err(status); return; } } else { @@ -545,7 +608,7 @@ static void test_prctl_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL); if (map == MAP_FAILED) return; @@ -568,7 +631,7 @@ static void test_prot_none(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0x11, size, PROT_NONE, false); + map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE); if (map == MAP_FAILED) goto unmap; @@ -599,7 +662,7 @@ int main(int argc, char **argv) int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { - exit(ksm_fork_exec_child() == 1 ? 0 : 1); + exit(test_child_ksm()); } #ifdef __NR_userfaultfd diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 9b298f6a04b3..9a0597310a76 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -20,6 +20,7 @@ #include <unistd.h> #include <errno.h> #include <stdio.h> +#include <fcntl.h> #include "../kselftest.h" @@ -83,6 +84,45 @@ static void test_mlock_limit(int fd) pass("mlock limit is respected\n"); } +static void test_vmsplice(int fd, const char *desc) +{ + ssize_t transferred; + struct iovec iov; + int pipefd[2]; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + goto close_pipe; + } + + /* + * vmsplice() may use GUP-fast, which must also fail. Prefault the + * page table, so GUP-fast could find it. + */ + memset(mem, PATTERN, page_size); + + iov.iov_base = mem; + iov.iov_len = page_size; + transferred = vmsplice(pipefd[1], &iov, 1, 0); + + if (transferred < 0 && errno == EFAULT) + pass("vmsplice is blocked as expected with %s\n", desc); + else + fail("vmsplice: unexpected memory access with %s\n", desc); + + munmap(mem, page_size); +close_pipe: + close(pipefd[0]); + close(pipefd[1]); +} + static void try_process_vm_read(int fd, int pipefd[2]) { struct iovec liov, riov; @@ -187,7 +227,6 @@ static void test_remote_access(int fd, const char *name, return; } - ftruncate(fd, page_size); memset(mem, PATTERN, page_size); if (write(pipefd[1], &mem, sizeof(mem)) < 0) { @@ -258,7 +297,7 @@ static void prepare(void) strerror(errno)); } -#define NUM_TESTS 4 +#define NUM_TESTS 6 int main(int argc, char *argv[]) { @@ -277,9 +316,17 @@ int main(int argc, char *argv[]) ksft_exit_fail_msg("memfd_secret failed: %s\n", strerror(errno)); } + if (ftruncate(fd, page_size)) + ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno)); test_mlock_limit(fd); test_file_apis(fd); + /* + * We have to run the first vmsplice test before any secretmem page was + * allocated for this fd. + */ + test_vmsplice(fd, "fresh page"); + test_vmsplice(fd, "existing page"); test_process_vm_read(fd); test_ptrace(fd); diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 26f744188ad0..7f0d50fa361d 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -20,8 +20,6 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) FILE *file; int ret = 1; char line[1024] = {0}; - char *end_addr; - char *stop; unsigned long start; unsigned long end; @@ -37,21 +35,10 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) memset(area, 0, sizeof(struct vm_boundaries)); while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { + if (sscanf(line, "%lx-%lx", &start, &end) != 2) { ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - ksft_print_msg("cannot parse /proc/self/maps\n"); - goto out; - } - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); if (start <= addr && end > addr) { area->start = start; diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 2f8b991f78cb..1b03bcfaefdf 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -23,6 +23,7 @@ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) #define SIZE_KB(k) ((size_t)k * 1024) @@ -69,6 +70,27 @@ enum { .expect_failure = should_fail \ } +/* compute square root using binary search */ +static unsigned long get_sqrt(unsigned long val) +{ + unsigned long low = 1; + + /* assuming rand_size is less than 1TB */ + unsigned long high = (1UL << 20); + + while (low <= high) { + unsigned long mid = low + (high - low) / 2; + unsigned long temp = mid * mid; + + if (temp == val) + return mid; + if (temp < val) + low = mid + 1; + high = mid - 1; + } + return low; +} + /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. @@ -126,19 +148,21 @@ static unsigned long long get_mmap_min_addr(void) * Using /proc/self/maps, assert that the specified address range is contained * within a single mapping. */ -static bool is_range_mapped(FILE *maps_fp, void *start, void *end) +static bool is_range_mapped(FILE *maps_fp, unsigned long start, + unsigned long end) { char *line = NULL; size_t len = 0; bool success = false; + unsigned long first_val, second_val; rewind(maps_fp); while (getline(&line, &len, maps_fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); + if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) { + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + break; + } if (first_val <= start && second_val >= end) { success = true; @@ -233,7 +257,8 @@ static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -272,7 +297,8 @@ static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) goto out; } - success = is_range_mapped(maps_fp, start, start + 3 * page_size); + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); munmap(start, 3 * page_size); out: @@ -296,7 +322,7 @@ out: * * |DDDDddddSSSSssss| */ -static void mremap_move_within_range(char pattern_seed) +static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) { char *test_name = "mremap mremap move within range"; void *src, *dest; @@ -316,10 +342,7 @@ static void mremap_move_within_range(char pattern_seed) src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); dest = src - SIZE_MB(2); @@ -357,14 +380,14 @@ out: /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) + char *rand_addr) { void *addr, *src_addr, *dest_addr, *dest_preamble_addr; - int d; - unsigned long long t; + unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; + unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -378,9 +401,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) - memset((char *) src_addr + t, (char) rand(), 1); + memcpy(src_addr, rand_addr, threshold); /* Mask to zero out lower bits of address for alignment */ align_mask = ~(c.dest_alignment - 1); @@ -420,9 +441,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Set byte pattern for the dest preamble block. */ - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) - memset((char *) dest_preamble_addr + d, (char) rand(), 1); + memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size); } clock_gettime(CLOCK_MONOTONIC, &t_start); @@ -436,15 +455,42 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (t = 0; t < threshold; t++) { - char c = (char) rand(); + /* + * Verify byte pattern after remapping. Employ an algorithm with a + * square root time complexity in threshold: divide the range into + * chunks, if memcmp() returns non-zero, only then perform an + * iteration in that chunk to find the mismatch index. + */ + num_chunks = get_sqrt(threshold); + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = threshold / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) + continue; + + /* brute force iteration only over mismatch segment */ + for (t = shift; t < shift + chunk_size; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } - if (((char *) dest_addr)[t] != c) { + /* + * if threshold is not divisible by num_chunks, then check the + * last chunk + */ + for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, ((char *) dest_addr)[t] & 0xff); ret = -1; goto clean_up_dest; @@ -452,22 +498,44 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } /* Verify the dest preamble byte pattern after remapping */ - if (c.dest_preamble_size) { - srand(pattern_seed); - for (d = 0; d < c.dest_preamble_size; d++) { - char c = (char) rand(); - - if (((char *) dest_preamble_addr)[d] != c) { - ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); + if (!c.dest_preamble_size) + goto no_preamble; + + num_chunks = get_sqrt(c.dest_preamble_size); + + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = c.dest_preamble_size / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, + chunk_size)) + continue; + + /* brute force iteration only over mismatched segment */ + for (d = shift; d < shift + chunk_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); ret = -1; goto clean_up_dest; } } } + for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + +no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; @@ -494,7 +562,8 @@ out: * the beginning of the mapping just because the aligned * down address landed on a mapping that maybe does not exist. */ -static void mremap_move_1mb_from_start(char pattern_seed) +static void mremap_move_1mb_from_start(unsigned int pattern_seed, + char *rand_addr) { char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; void *src = NULL, *dest = NULL; @@ -520,10 +589,7 @@ static void mremap_move_1mb_from_start(char pattern_seed) } /* Set byte pattern for source block. */ - srand(pattern_seed); - for (i = 0; i < SIZE_MB(2); i++) { - ((char *)src)[i] = (char) rand(); - } + memcpy(src, rand_addr, SIZE_MB(2)); /* * Unmap the beginning of dest so that the aligned address @@ -568,10 +634,10 @@ out: static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, - unsigned int pattern_seed) + unsigned int pattern_seed, char *rand_addr) { long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); + rand_addr); if (remap_time < 0) { if (test_case.expect_failure) @@ -642,7 +708,15 @@ int main(int argc, char **argv) int failures = 0; int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + + /* hard-coded test configs */ + size_t max_test_variable_region_size = _2GB; + size_t max_test_constant_region_size = _2MB; + size_t dest_preamble_size = 10 * _4MB; + unsigned int pattern_seed; + char *rand_addr; + size_t rand_size; int num_expand_tests = 2; int num_misc_tests = 2; struct test test_cases[MAX_TEST] = {}; @@ -659,6 +733,31 @@ int main(int argc, char **argv) ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", threshold_mb, pattern_seed); + /* + * set preallocated random array according to test configs; see the + * functions for the logic of setting the size + */ + if (!threshold_mb) + rand_size = MAX(max_test_variable_region_size, + max_test_constant_region_size); + else + rand_size = MAX(MIN(threshold_mb * _1MB, + max_test_variable_region_size), + max_test_constant_region_size); + rand_size = MAX(dest_preamble_size, rand_size); + + rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (rand_addr == MAP_FAILED) { + perror("mmap"); + ksft_exit_fail_msg("cannot mmap rand_addr\n"); + } + + /* fill stream of random bytes */ + srand(pattern_seed); + for (unsigned long i = 0; i < rand_size; ++i) + rand_addr[i] = (char) rand(); + page_size = sysconf(_SC_PAGESIZE); /* Expected mremap failures */ @@ -730,13 +829,13 @@ int main(int argc, char **argv) for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); + pattern_seed, rand_addr); maps_fp = fopen("/proc/self/maps", "r"); if (maps_fp == NULL) { - ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); - exit(KSFT_FAIL); + munmap(rand_addr, rand_size); + ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); } mremap_expand_merge(maps_fp, page_size); @@ -744,17 +843,20 @@ int main(int argc, char **argv) fclose(maps_fp); - mremap_move_within_range(pattern_seed); - mremap_move_1mb_from_start(pattern_seed); + mremap_move_within_range(pattern_seed, rand_addr); + mremap_move_1mb_from_start(pattern_seed, rand_addr); if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); + threshold_mb, pattern_seed, + rand_addr); } + munmap(rand_addr, rand_size); + if (failures > 0) ksft_exit_fail(); else diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4bdb3a0c7a60..3157204b9047 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -152,9 +152,13 @@ done < /proc/meminfo # both of these requirements into account and attempt to increase # number of huge pages available. nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) +uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) +hugetlb_min_KB=$((256 * 1024)) +if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then + needmem_KB=$uffd_min_KB +else + needmem_KB=$hugetlb_min_KB +fi # set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then @@ -294,7 +298,8 @@ CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half -# the size ($half_ufd_size_MB), which is used for *each*. +# the size of the free pages we have, which is used for *each*. +half_ufd_size_MB=$((freepgs / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index d9dbf879748b..bdfa5d085f00 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -209,5 +209,5 @@ int main(int argc, char **argv) close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 7bcf8d48256a..4e4c1e311247 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,8 @@ #include <errno.h> #include <sys/mman.h> #include <sys/time.h> +#include <fcntl.h> + #include "../kselftest.h" /* @@ -85,7 +87,7 @@ static int validate_lower_address_hint(void) char *ptr; ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) return 0; @@ -93,6 +95,66 @@ static int validate_lower_address_hint(void) return 1; } +static int validate_complete_va_space(void) +{ + unsigned long start_addr, end_addr, prev_end_addr; + char line[400]; + char prot[6]; + FILE *file; + int fd; + + fd = open("va_dump", O_CREAT | O_WRONLY, 0600); + unlink("va_dump"); + if (fd < 0) { + ksft_test_result_skip("cannot create or open dump file\n"); + ksft_finished(); + } + + file = fopen("/proc/self/maps", "r"); + if (file == NULL) + ksft_exit_fail_msg("cannot open /proc/self/maps\n"); + + prev_end_addr = 0; + while (fgets(line, sizeof(line), file)) { + unsigned long hop; + + if (sscanf(line, "%lx-%lx %s[rwxp-]", + &start_addr, &end_addr, prot) != 3) + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + + /* end of userspace mappings; ignore vsyscall mapping */ + if (start_addr & (1UL << 63)) + return 0; + + /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ + if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) + return 1; + + prev_end_addr = end_addr; + + if (prot[0] != 'r') + continue; + + /* + * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. + * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 + * addresses after that. If the address was not held by this + * process, write would fail with errno set to EFAULT. + * Anyways, if write returns anything apart from 1, exit the + * program since that would mean a bug in /proc/self/maps. + */ + hop = 0; + while (start_addr + hop < end_addr) { + if (write(fd, (void *)(start_addr + hop), 1) != 1) + return 1; + lseek(fd, 0, SEEK_SET); + + hop += MAP_CHUNK_SIZE; + } + } + return 0; +} + int main(int argc, char *argv[]) { char *ptr[NR_CHUNKS_LOW]; @@ -105,13 +167,11 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } + if (validate_lower_address_hint()) + ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); break; } @@ -127,7 +187,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (hptr[i] == MAP_FAILED) break; @@ -135,6 +195,10 @@ int main(int argc, char *argv[]) validate_addr(hptr[i], 1); } hchunks = i; + if (validate_complete_va_space()) { + ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); + ksft_finished(); + } for (i = 0; i < lchunks; i++) munmap(ptr[i], MAP_CHUNK_SIZE); diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index 2aa66d2a1702..e6a3e04fd83f 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -478,10 +478,10 @@ v3exc_timeout_test() RET=0 local X=("192.0.2.20" "192.0.2.30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 v3exclude_prepare $h1 $ALL_MAC $ALL_GROUP ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ v3exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 -b $ALL_MAC -B $ALL_GROUP -t ip "proto=2,p=$MZPKT_ALLOW2" -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index e2b9ff773c6b..f84ab2e65754 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -478,10 +478,10 @@ mldv2exc_timeout_test() RET=0 local X=("2001:db8:1::20" "2001:db8:1::30") - # GMI should be 3 seconds + # GMI should be 5 seconds ip link set dev br0 type bridge mcast_query_interval 100 \ mcast_query_response_interval 100 \ - mcast_membership_interval 300 + mcast_membership_interval 500 mldv2exclude_prepare $h1 ip link set dev br0 type bridge mcast_query_interval 500 \ @@ -489,7 +489,7 @@ mldv2exc_timeout_test() mcast_membership_interval 1500 $MZ $h1 -c 1 $MZPKT_ALLOW2 -q - sleep 3 + sleep 5 bridge -j -d -s mdb show dev br0 \ | jq -e ".[].mdb[] | \ select(.grp == \"$TEST_GROUP\" and \ diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c5b0cbc85b3e..4b364cdf3ef0 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -155,25 +155,30 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true - check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \ - "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name \ + "Multicast IPv4 to unknown group" \ + "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ + false check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ @@ -187,9 +192,10 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true - check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ - "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + xfail_on_veth $h1 \ + check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ + "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ + false check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index c868c0aec121..72b191e4e064 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -127,15 +127,17 @@ setup_ns() local ns="" local ns_name="" local ns_list="" + local ns_exist= for ns_name in "$@"; do # Some test may setup/remove same netns multi times if unset ${ns_name} 2> /dev/null; then ns="${ns_name,,}-$(mktemp -u XXXXXX)" eval readonly ${ns_name}="$ns" + ns_exist=false else eval ns='$'${ns_name} cleanup_ns "$ns" - + ns_exist=true fi if ! ip netns add "$ns"; then @@ -144,7 +146,7 @@ setup_ns() return $ksft_skip fi ip -n "$ns" link set lo up - ns_list="$ns_list $ns" + ! $ns_exist && ns_list="$ns_list $ns" done NS_LIST="$NS_LIST $ns_list" } diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 457789530645..3eeeeffb4005 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -293,7 +293,7 @@ check_random_order() local ns=$1 local log=$2 - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow @@ -306,7 +306,7 @@ check_random_order() done done - for i in $(seq 100); do + for i in $(seq 50); do ip -net $ns xfrm policy flush for j in $(seq 0 16 255 | sort -R); do local addr=$(printf "e000:0000:%02x00::/56" $j) diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index c376151982c4..b175e94e1901 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -7,12 +7,6 @@ ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/) ifeq ($(ARCH),powerpc) -GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown") - -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS) - -export CFLAGS - SUB_DIRS = alignment \ benchmarks \ cache_shape \ @@ -46,6 +40,7 @@ $(SUB_DIRS): BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all include ../lib.mk +include ./flags.mk override define RUN_TESTS +@for TARGET in $(SUB_DIRS); do \ @@ -57,14 +52,14 @@ endef override define INSTALL_RULE +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install;\ done; endef emit_tests: +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET $@;\ done; override define CLEAN diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93e9af37449d..66d5d7aaeb20 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index a32a6ab89914..1321922038d0 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,10 +4,11 @@ TEST_GEN_FILES := exec_target TEST_FILES := settings -CFLAGS += -O2 - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -O2 $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 689f6c8ebcd8..3a3ca956ac66 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 77594e697f2f..42940f92d832 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -1,14 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -m64 -CFLAGS += -I$(CURDIR) -CFLAGS += -D SELFTEST -CFLAGS += -maltivec -CFLAGS += -mcpu=power4 - -# Use our CFLAGS for the implicit .S rule & set the asm machine type -ASFLAGS = $(CFLAGS) -Wa,-mpower4 - TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ @@ -20,6 +10,17 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -m64 +CFLAGS += -I$(CURDIR) +CFLAGS += -D SELFTEST +CFLAGS += -maltivec +CFLAGS += -mcpu=power4 + +# Use our CFLAGS for the implicit .S rule & set the asm machine type +ASFLAGS = $(CFLAGS) -Wa,-mpower4 $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index b82f45dd46b9..11eefb4b9fa4 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,2 +1,4 @@ +dexcr_test hashchk_test +chdexcr lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 76210f2bcec3..58cf9f722905 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,9 +1,12 @@ -TEST_GEN_PROGS := hashchk_test -TEST_GEN_FILES := lsdexcr +TEST_GEN_PROGS := dexcr_test hashchk_test +TEST_GEN_FILES := lsdexcr chdexcr include ../../lib.mk +include ../flags.mk -$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) +CFLAGS += $(KHDR_INCLUDES) + +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c $(TEST_GEN_FILES): ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c new file mode 100644 index 000000000000..c548d7a5bb9b --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <errno.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> + +#include "dexcr.h" +#include "utils.h" + +static void die(const char *msg) +{ + printf("%s\n", msg); + exit(1); +} + +static void help(void) +{ + printf("Invoke a provided program with a custom DEXCR on-exec reset value\n" + "\n" + "usage: chdexcr [CHDEXCR OPTIONS] -- PROGRAM [ARGS...]\n" + "\n" + "Each configurable DEXCR aspect is exposed as an option.\n" + "\n" + "The normal option sets the aspect in the DEXCR. The --no- variant\n" + "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" + "so indirect branch prediction will be disabled in the provided program.\n" + "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" + "prediction may occur.\n" + "\n" + "CHDEXCR OPTIONS:\n"); + + for (int i = 0; i < ARRAY_SIZE(aspects); i++) { + const struct dexcr_aspect *aspect = &aspects[i]; + + if (aspect->prctl == -1) + continue; + + printf(" --%-6s / --no-%-6s : %s\n", aspect->opt, aspect->opt, aspect->desc); + } +} + +static const struct dexcr_aspect *opt_to_aspect(const char *opt) +{ + for (int i = 0; i < ARRAY_SIZE(aspects); i++) + if (aspects[i].prctl != -1 && !strcmp(aspects[i].opt, opt)) + return &aspects[i]; + + return NULL; +} + +static int apply_option(const char *option) +{ + const struct dexcr_aspect *aspect; + const char *opt = NULL; + const char *set_prefix = "--"; + const char *clear_prefix = "--no-"; + unsigned long ctrl = 0; + int err; + + if (!strcmp(option, "-h") || !strcmp(option, "--help")) { + help(); + exit(0); + } + + /* Strip out --(no-) prefix and determine ctrl value */ + if (!strncmp(option, clear_prefix, strlen(clear_prefix))) { + opt = &option[strlen(clear_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + } else if (!strncmp(option, set_prefix, strlen(set_prefix))) { + opt = &option[strlen(set_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + } + + if (!opt || !*opt) + return 1; + + aspect = opt_to_aspect(opt); + if (!aspect) + die("unknown aspect"); + + err = pr_set_dexcr(aspect->prctl, ctrl); + if (err) + die("failed to apply option"); + + return 0; +} + +int main(int argc, char *const argv[]) +{ + int i; + + if (!dexcr_exists()) + die("DEXCR not detected on this hardware"); + + for (i = 1; i < argc; i++) + if (apply_option(argv[i])) + break; + + if (i < argc && !strcmp(argv[i], "--")) + i++; + + if (i >= argc) + die("missing command"); + + execvp(argv[i], &argv[i]); + perror("execve"); + + return errno; +} diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c index 65ec5347de98..468fd0dc9912 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -3,6 +3,7 @@ #include <errno.h> #include <setjmp.h> #include <signal.h> +#include <sys/prctl.h> #include <sys/types.h> #include <sys/wait.h> @@ -43,6 +44,45 @@ out: return exists; } +unsigned int pr_which_to_aspect(unsigned long which) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + return DEXCR_PR_SBHE; + case PR_PPC_DEXCR_IBRTPD: + return DEXCR_PR_IBRTPD; + case PR_PPC_DEXCR_SRAPD: + return DEXCR_PR_SRAPD; + case PR_PPC_DEXCR_NPHIE: + return DEXCR_PR_NPHIE; + default: + FAIL_IF_EXIT_MSG(true, "unknown PR aspect"); + } +} + +int pr_get_dexcr(unsigned long which) +{ + return prctl(PR_PPC_GET_DEXCR, which, 0UL, 0UL, 0UL); +} + +int pr_set_dexcr(unsigned long which, unsigned long ctrl) +{ + return prctl(PR_PPC_SET_DEXCR, which, ctrl, 0UL, 0UL); +} + +bool pr_dexcr_aspect_supported(unsigned long which) +{ + if (pr_get_dexcr(which) == -1) + return errno == ENODEV; + + return true; +} + +bool pr_dexcr_aspect_editable(unsigned long which) +{ + return pr_get_dexcr(which) & PR_PPC_DEXCR_CTRL_EDITABLE; +} + /* * Just test if a bad hashchk triggers a signal, without checking * for support or if the NPHIE aspect is enabled. diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index f55cbbc8643b..51e9ba3b0997 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -9,6 +9,7 @@ #define _SELFTESTS_POWERPC_DEXCR_DEXCR_H #include <stdbool.h> +#include <sys/prctl.h> #include <sys/types.h> #include "reg.h" @@ -26,8 +27,64 @@ #define PPC_RAW_HASHCHK(b, i, a) \ str(.long (0x7C0005E4 | PPC_RAW_HASH_ARGS(b, i, a));) +struct dexcr_aspect { + const char *name; /* Short display name */ + const char *opt; /* Option name for chdexcr */ + const char *desc; /* Expanded aspect meaning */ + unsigned int index; /* Aspect bit index in DEXCR */ + unsigned long prctl; /* 'which' value for get/set prctl */ +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .opt = "sbhe", + .desc = "Speculative branch hint enable", + .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + }, + { + .name = "IBRTPD", + .opt = "ibrtpd", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + }, + { + .name = "SRAPD", + .opt = "srapd", + .desc = "Subroutine return address prediction disable", + .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + }, + { + .name = "NPHIE", + .opt = "nphie", + .desc = "Non-privileged hash instruction enable", + .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + }, + { + .name = "PHIE", + .opt = "phie", + .desc = "Privileged hash instruction enable", + .index = 6, + .prctl = -1, + }, +}; + bool dexcr_exists(void); +bool pr_dexcr_aspect_supported(unsigned long which); + +bool pr_dexcr_aspect_editable(unsigned long which); + +int pr_get_dexcr(unsigned long pr_aspect); + +int pr_set_dexcr(unsigned long pr_aspect, unsigned long ctrl); + +unsigned int pr_which_to_aspect(unsigned long which); + bool hashchk_triggers(void); enum dexcr_source { diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr_test.c b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c new file mode 100644 index 000000000000..7a8657164908 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "dexcr.h" +#include "utils.h" + +/* + * Helper function for testing the behaviour of a newly exec-ed process + */ +static int dexcr_prctl_onexec_test_child(unsigned long which, const char *status) +{ + unsigned long dexcr = mfspr(SPRN_DEXCR_RO); + unsigned long aspect = pr_which_to_aspect(which); + int ctrl = pr_get_dexcr(which); + + if (!strcmp(status, "set")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), + "setting aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(!(aspect & dexcr), "setting aspect across exec did not take effect"); + } else if (!strcmp(status, "clear")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), + "clearing aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(aspect & dexcr, "clearing aspect across exec did not take effect"); + } else { + FAIL_IF_EXIT_MSG(true, "unknown expected status"); + } + + return 0; +} + +/* + * Test that the given prctl value can be manipulated freely + */ +static int dexcr_prctl_aspect_test(unsigned long which) +{ + unsigned long aspect = pr_which_to_aspect(which); + pid_t pid; + int ctrl; + int err; + int errno_save; + + SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_supported(which), "DEXCR aspect not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_editable(which), "DEXCR aspect not editable with prctl"); + + /* We reject invalid combinations of arguments */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear should be rejected with EINVAL"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear on exec should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear on exec should be rejected with EINVAL"); + + /* We set the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR, "config value unexpected clear flag"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "setting aspect did not take effect"); + + /* We clear the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET, "config value unexpected set flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "clearing aspect did not take effect"); + + /* We make it set on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC, "config value unexpected clear on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "scheduling aspect to set on exec should not change it now"); + + /* We make it clear on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect config should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC, "config value unexpected set on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should still be cleared"); + + /* We allow setting the current and on-exec value in a single call */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "process aspect should be set"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should be clear"); + + /* Verify the onexec value is applied across exec */ + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "set", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(mfspr(SPRN_DEXCR_RO) & aspect, + "setting aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "clear", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(!(mfspr(SPRN_DEXCR_RO) & aspect), + "clearing aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + return 0; +} + +static int dexcr_prctl_ibrtpd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_IBRTPD); +} + +static int dexcr_prctl_srapd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_SRAPD); +} + +static int dexcr_prctl_nphie_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_NPHIE); +} + +int main(int argc, char *argv[]) +{ + int err = 0; + + /* + * Some tests require checking what happens across exec, so we may be + * invoked as the child of a particular test + */ + if (argc > 1) { + if (argc == 3 && !strcmp(argv[0], "dexcr_prctl_onexec_test_child")) { + unsigned long which; + + err = parse_ulong(argv[1], strlen(argv[1]), &which, 10); + FAIL_IF_MSG(err, "failed to parse which value for child"); + + return dexcr_prctl_onexec_test_child(which, argv[2]); + } + + FAIL_IF_MSG(true, "unknown test case"); + } + + /* + * Otherwise we are the main test invocation and run the full suite + */ + err |= test_harness(dexcr_prctl_ibrtpd_test, "dexcr_prctl_ibrtpd"); + err |= test_harness(dexcr_prctl_srapd_test, "dexcr_prctl_srapd"); + err |= test_harness(dexcr_prctl_nphie_test, "dexcr_prctl_nphie"); + + return err; +} diff --git a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c index 7d5658c9ebe4..645224bdc142 100644 --- a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c +++ b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c @@ -21,8 +21,14 @@ static int require_nphie(void) { SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + + pr_set_dexcr(PR_PPC_DEXCR_NPHIE, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + + if (get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE) + return 0; + SKIP_IF_MSG(!(get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE), - "DEXCR[NPHIE] not enabled"); + "Failed to enable DEXCR[NPHIE]"); return 0; } diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index 94abbfcc389e..7588929180ab 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -#include <errno.h> #include <stddef.h> #include <stdio.h> #include <string.h> +#include <sys/prctl.h> #include "dexcr.h" #include "utils.h" @@ -12,40 +12,6 @@ static unsigned int dexcr; static unsigned int hdexcr; static unsigned int effective; -struct dexcr_aspect { - const char *name; - const char *desc; - unsigned int index; -}; - -static const struct dexcr_aspect aspects[] = { - { - .name = "SBHE", - .desc = "Speculative branch hint enable", - .index = 0, - }, - { - .name = "IBRTPD", - .desc = "Indirect branch recurrent target prediction disable", - .index = 3, - }, - { - .name = "SRAPD", - .desc = "Subroutine return address prediction disable", - .index = 4, - }, - { - .name = "NPHIE", - .desc = "Non-privileged hash instruction enable", - .index = 5, - }, - { - .name = "PHIE", - .desc = "Privileged hash instruction enable", - .index = 6, - }, -}; - static void print_list(const char *list[], size_t len) { for (size_t i = 0; i < len; i++) { @@ -60,7 +26,7 @@ static void print_dexcr(char *name, unsigned int bits) const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; size_t j = 0; - printf("%s: %08x", name, bits); + printf("%s: 0x%08x", name, bits); if (bits == 0) { printf("\n"); @@ -103,6 +69,63 @@ static void print_aspect(const struct dexcr_aspect *aspect) printf(" \t(%s)\n", aspect->desc); } +static void print_aspect_config(const struct dexcr_aspect *aspect) +{ + const char *reason = NULL; + const char *reason_hyp = NULL; + const char *reason_prctl = "no prctl"; + bool actual = effective & DEXCR_PR_BIT(aspect->index); + bool expected = actual; /* Assume it's fine if we don't expect a specific set/clear value */ + + if (actual) + reason = "set by unknown"; + else + reason = "cleared by unknown"; + + if (aspect->prctl != -1) { + int ctrl = pr_get_dexcr(aspect->prctl); + + if (ctrl < 0) { + reason_prctl = "failed to read prctl"; + } else { + if (ctrl & PR_PPC_DEXCR_CTRL_SET) { + reason_prctl = "set by prctl"; + expected = true; + } else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + reason_prctl = "cleared by prctl"; + expected = false; + } else { + reason_prctl = "unknown prctl"; + } + + reason = reason_prctl; + } + } + + if (hdexcr & DEXCR_PR_BIT(aspect->index)) { + reason_hyp = "set by hypervisor"; + reason = reason_hyp; + expected = true; + } else { + reason_hyp = "not modified by hypervisor"; + } + + printf("%12s (%d): %-28s (%s, %s)\n", + aspect->name, + aspect->index, + reason, + reason_hyp, + reason_prctl); + + /* + * The checks are not atomic, so this can technically trigger if the + * hypervisor makes a change while we are checking each source. It's + * far more likely to be a bug if we see this though. + */ + if (actual != expected) + printf(" : ! actual %s does not match config\n", aspect->name); +} + int main(int argc, char *argv[]) { if (!dexcr_exists()) { @@ -114,6 +137,8 @@ int main(int argc, char *argv[]) hdexcr = get_dexcr(HDEXCR); effective = dexcr | hdexcr; + printf("current status:\n"); + print_dexcr(" DEXCR", dexcr); print_dexcr(" HDEXCR", hdexcr); print_dexcr("Effective", effective); @@ -136,6 +161,12 @@ int main(int argc, char *argv[]) else printf("ignored\n"); } + printf("\n"); + + printf("configuration:\n"); + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) + print_aspect_config(&aspects[i]); + printf("\n"); return 0; } diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 9289d5febe1e..9fa9cb5bd989 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -5,6 +5,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread $(OUTPUT)/dscr_explicit_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile index ae963eb2dc5b..70797716f2b5 100644 --- a/tools/testing/selftests/powerpc/eeh/Makefile +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -7,3 +7,4 @@ TEST_FILES := eeh-functions.sh settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk new file mode 100644 index 000000000000..b909bee3cb2a --- /dev/null +++ b/tools/testing/selftests/powerpc/flags.mk @@ -0,0 +1,12 @@ +#This checks for any ENV variables and add those. + +ifeq ($(GIT_VERSION),) +GIT_VERSION := $(shell git describe --always --long --dirty || echo "unknown") +export GIT_VERSION +endif + +ifeq ($(CFLAGS),) +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(CFLAGS) +export CFLAGS +endif + diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 3948f7c510aa..b14fd2e0c6a8 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c $(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile index 2424513982d9..ce4ed679aaaf 100644 --- a/tools/testing/selftests/powerpc/mce/Makefile +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := inject-ra-err include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 4a6608beef0e..aab058ecb352 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 0785c2e99d40..480d8ba94cf7 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,9 @@ -CFLAGS = -O3 -m64 -I./include -I../include - TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +include ../flags.mk + +CFLAGS = -O3 -m64 -I./include -I../include $(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile b/tools/testing/selftests/powerpc/papr_attributes/Makefile index e899712d49db..406429499022 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/Makefile +++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := attr_test top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -$(TEST_GEN_PROGS): ../harness.c ../utils.c
\ No newline at end of file +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile index 7f79e437634a..fed4f2414dbf 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/Makefile +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_sysparm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile index 06b719703bfd..b09852e40882 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/Makefile +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_vpd top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index a284fa874a9f..7e9dbf3d0d09 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -7,8 +7,11 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests +SUB_DIRS := ebb sampling_tests event_code_tests + +all: $(TEST_GEN_PROGS) $(SUB_DIRS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -22,12 +25,16 @@ $(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) $(OUTPUT)/per_event_excludes: ../utils.c +$(SUB_DIRS): + BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all + DEFAULT_RUN_TESTS := $(RUN_TESTS) override define RUN_TESTS $(DEFAULT_RUN_TESTS) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests; \ + done; endef emit_tests: @@ -35,34 +42,29 @@ emit_tests: BASENAME_TEST=`basename $$TEST`; \ echo "$(COLLECTION):$$BASENAME_TEST"; \ done - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET emit_tests; \ + done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install; \ + done; endef DEFAULT_CLEAN := $(CLEAN) override define CLEAN $(DEFAULT_CLEAN) $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \ + done; endef -ebb: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -sampling_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -event_code_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all .PHONY: all run_tests ebb sampling_tests event_code_tests emit_tests diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 010160690227..1b39af7c10db 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -4,16 +4,6 @@ include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ -# The EBB handler is 64-bit code and everything links against it -CFLAGS += -m64 - -TMPOUT = $(OUTPUT)/TMPDIR/ -# Toolchains may build PIE by default which breaks the assembly -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -LDFLAGS += $(no-pie-option) - TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ @@ -28,6 +18,17 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +# The EBB handler is 64-bit code and everything links against it +CFLAGS += -m64 + +TMPOUT = $(OUTPUT)/TMPDIR/ +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ ebb.c ebb_handler.S trace.c busy_loop.S diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 4e07d7046457..fdb080b3fa65 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \ group_constraint_mmcra_sample_test invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 9e67351fb252..9f79bec5fce7 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ @@ -11,5 +9,8 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk + +CFLAGS += -m64 $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 9b9491a63213..23bd9a7590dd 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -I$(CURDIR) - TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 1b39b86849da..59ca01d8567e 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -26,6 +26,7 @@ LOCAL_HDRS += $(patsubst %,$(selfdir)/powerpc/ptrace/%,$(wildcard *.h)) top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk TM_TESTS := $(patsubst %,$(OUTPUT)/%,$(TM_TESTS)) TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64)) diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index e0d979ab0204..33286039724a 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -5,9 +5,10 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. -CFLAGS += $(KHDR_INCLUDES) - include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index f679d260afc8..ece95bd52be9 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -3,7 +3,6 @@ TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel TEST_GEN_PROGS += sigreturn_unaligned -CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 @@ -11,5 +10,8 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -maltivec $(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 9c39f55a58ff..4c9d9a58c9d1 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -I$(CURDIR) - EXTRA_SOURCES := ../harness.c build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi) @@ -27,9 +24,13 @@ $(OUTPUT)/strlen_32: CFLAGS += -m32 TEST_GEN_PROGS += strlen_32 endif -ASFLAGS = $(CFLAGS) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +# The loops are all 64-bit code +CFLAGS += -I$(CURDIR) + +ASFLAGS = $(CFLAGS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afedb0..0da2e0a74264 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := switch_endian_test -ASFLAGS += -O2 -Wall -g -nostdlib -m64 - EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +ASFLAGS += -O2 -Wall -g -nostdlib -m64 $(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index ee1740ddfb0c..3bc07af88f0e 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := ipc_unmuxed rtas_filter -CFLAGS += $(KHDR_INCLUDES) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += $(KHDR_INCLUDES) $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 3876805c2f31..f13f0ab36007 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -11,6 +11,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index cf65cbf33085..61d519a076c6 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,10 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 -I$(CURDIR) - top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk + +CFLAGS += -m64 -I$(CURDIR) $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/ring-buffer/.gitignore b/tools/testing/selftests/ring-buffer/.gitignore new file mode 100644 index 000000000000..3aed1a2a6c67 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/.gitignore @@ -0,0 +1 @@ +map_test diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile new file mode 100644 index 000000000000..627c5fa6d1ab --- /dev/null +++ b/tools/testing/selftests/ring-buffer/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -D_GNU_SOURCE + +TEST_GEN_PROGS = map_test + +include ../lib.mk diff --git a/tools/testing/selftests/ring-buffer/config b/tools/testing/selftests/ring-buffer/config new file mode 100644 index 000000000000..d936f8f00e78 --- /dev/null +++ b/tools/testing/selftests/ring-buffer/config @@ -0,0 +1,2 @@ +CONFIG_FTRACE=y +CONFIG_TRACER_SNAPSHOT=y diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c new file mode 100644 index 000000000000..a9006fa7097e --- /dev/null +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ring-buffer memory mapping tests + * + * Copyright (c) 2024 Vincent Donnefort <vdonnefort@google.com> + */ +#include <fcntl.h> +#include <sched.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include <linux/trace_mmap.h> + +#include <sys/mman.h> +#include <sys/ioctl.h> + +#include "../user_events/user_events_selftests.h" /* share tracefs setup */ +#include "../kselftest_harness.h" + +#define TRACEFS_ROOT "/sys/kernel/tracing" + +static int __tracefs_write(const char *path, const char *value) +{ + int fd, ret; + + fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + return fd; + + ret = write(fd, value, strlen(value)); + + close(fd); + + return ret == -1 ? -errno : 0; +} + +static int __tracefs_write_int(const char *path, int value) +{ + char *str; + int ret; + + if (asprintf(&str, "%d", value) < 0) + return -1; + + ret = __tracefs_write(path, str); + + free(str); + + return ret; +} + +#define tracefs_write_int(path, value) \ + ASSERT_EQ(__tracefs_write_int((path), (value)), 0) + +#define tracefs_write(path, value) \ + ASSERT_EQ(__tracefs_write((path), (value)), 0) + +static int tracefs_reset(void) +{ + if (__tracefs_write_int(TRACEFS_ROOT"/tracing_on", 0)) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/trace", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/set_event", "")) + return -1; + if (__tracefs_write(TRACEFS_ROOT"/current_tracer", "nop")) + return -1; + + return 0; +} + +struct tracefs_cpu_map_desc { + struct trace_buffer_meta *meta; + int cpu_fd; +}; + +int tracefs_cpu_map(struct tracefs_cpu_map_desc *desc, int cpu) +{ + int page_size = getpagesize(); + char *cpu_path; + void *map; + + if (asprintf(&cpu_path, + TRACEFS_ROOT"/per_cpu/cpu%d/trace_pipe_raw", + cpu) < 0) + return -ENOMEM; + + desc->cpu_fd = open(cpu_path, O_RDONLY | O_NONBLOCK); + free(cpu_path); + if (desc->cpu_fd < 0) + return -ENODEV; + + map = mmap(NULL, page_size, PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + if (map == MAP_FAILED) + return -errno; + + desc->meta = (struct trace_buffer_meta *)map; + + return 0; +} + +void tracefs_cpu_unmap(struct tracefs_cpu_map_desc *desc) +{ + munmap(desc->meta, desc->meta->meta_page_size); + close(desc->cpu_fd); +} + +FIXTURE(map) { + struct tracefs_cpu_map_desc map_desc; + bool umount; +}; + +FIXTURE_VARIANT(map) { + int subbuf_size; +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_4k) { + .subbuf_size = 4, +}; + +FIXTURE_VARIANT_ADD(map, subbuf_size_8k) { + .subbuf_size = 8, +}; + +FIXTURE_SETUP(map) +{ + int cpu = sched_getcpu(); + cpu_set_t cpu_mask; + bool fail, umount; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; + + ASSERT_GE(cpu, 0); + + ASSERT_EQ(tracefs_reset(), 0); + + tracefs_write_int(TRACEFS_ROOT"/buffer_subbuf_size_kb", variant->subbuf_size); + + ASSERT_EQ(tracefs_cpu_map(&self->map_desc, cpu), 0); + + /* + * Ensure generated events will be found on this very same ring-buffer. + */ + CPU_ZERO(&cpu_mask); + CPU_SET(cpu, &cpu_mask); + ASSERT_EQ(sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask), 0); +} + +FIXTURE_TEARDOWN(map) +{ + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); + + tracefs_cpu_unmap(&self->map_desc); +} + +TEST_F(map, meta_page_check) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + int cnt = 0; + + ASSERT_EQ(desc->meta->entries, 0); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 0); + + ASSERT_EQ(desc->meta->reader.id, 0); + ASSERT_EQ(desc->meta->reader.read, 0); + + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + ASSERT_EQ(desc->meta->reader.id, 0); + + tracefs_write_int(TRACEFS_ROOT"/tracing_on", 1); + for (int i = 0; i < 16; i++) + tracefs_write_int(TRACEFS_ROOT"/trace_marker", i); +again: + ASSERT_EQ(ioctl(desc->cpu_fd, TRACE_MMAP_IOCTL_GET_READER), 0); + + ASSERT_EQ(desc->meta->entries, 16); + ASSERT_EQ(desc->meta->overrun, 0); + ASSERT_EQ(desc->meta->read, 16); + + ASSERT_EQ(desc->meta->reader.id, 1); + + if (!(cnt++)) + goto again; +} + +TEST_F(map, data_mmap) +{ + struct tracefs_cpu_map_desc *desc = &self->map_desc; + unsigned long meta_len, data_len; + void *data; + + meta_len = desc->meta->meta_page_size; + data_len = desc->meta->subbuf_size * desc->meta->nr_subbufs; + + /* Map all the available subbufs */ + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Map all the available subbufs - 1 */ + data_len -= desc->meta->subbuf_size; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_NE(data, MAP_FAILED); + munmap(data, data_len); + + /* Overflow the available subbufs by 1 */ + meta_len += desc->meta->subbuf_size * 2; + data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, + desc->cpu_fd, meta_len); + ASSERT_EQ(data, MAP_FAILED); +} + +FIXTURE(snapshot) { + bool umount; +}; + +FIXTURE_SETUP(snapshot) +{ + bool fail, umount; + struct stat sb; + char *message; + + if (getuid() != 0) + SKIP(return, "Skipping: %s", "Please run the test as root"); + + if (stat(TRACEFS_ROOT"/snapshot", &sb)) + SKIP(return, "Skipping: %s", "snapshot not available"); + + if (!tracefs_enabled(&message, &fail, &umount)) { + if (fail) { + TH_LOG("Tracefs setup failed: %s", message); + ASSERT_FALSE(fail); + } + SKIP(return, "Skipping: %s", message); + } + + self->umount = umount; +} + +FIXTURE_TEARDOWN(snapshot) +{ + __tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "!snapshot"); + tracefs_reset(); + + if (self->umount) + tracefs_unmount(); +} + +TEST_F(snapshot, excludes_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_GE(cpu, 0); + tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"); + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), -EBUSY); +} + +TEST_F(snapshot, excluded_by_map) +{ + struct tracefs_cpu_map_desc map_desc; + int cpu = sched_getcpu(); + + ASSERT_EQ(tracefs_cpu_map(&map_desc, cpu), 0); + + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/events/sched/sched_switch/trigger", + "snapshot"), -EBUSY); + ASSERT_EQ(__tracefs_write(TRACEFS_ROOT"/snapshot", + "1"), -EBUSY); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index dcd7509fe2e0..0bb46793dcd4 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -261,6 +261,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Register without separator spacing should still match */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;u32 field2"; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ reg.enable_bit = 29; reg.name_args = (__u64)"__test_event u32 field1;"; @@ -288,6 +294,8 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); + unreg.disable_bit = 29; + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); /* Delete should have been auto-done after close and unregister */ close(self->data_fd); diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c index 757e6527f67e..ee909a7927f9 100644 --- a/tools/testing/selftests/x86/test_shadow_stack.c +++ b/tools/testing/selftests/x86/test_shadow_stack.c @@ -556,7 +556,7 @@ struct node { * looked at the shadow stack gaps. * 5. See if it landed in the gap. */ -int test_guard_gap(void) +int test_guard_gap_other_gaps(void) { void *free_area, *shstk, *test_map = (void *)0xFFFFFFFFFFFFFFFF; struct node *head = NULL, *cur; @@ -593,11 +593,64 @@ int test_guard_gap(void) if (shstk - test_map - PAGE_SIZE != PAGE_SIZE) return 1; - printf("[OK]\tGuard gap test\n"); + printf("[OK]\tGuard gap test, other mapping's gaps\n"); return 0; } +/* Tests respecting the guard gap of the mapping getting placed */ +int test_guard_gap_new_mappings_gaps(void) +{ + void *free_area, *shstk_start, *test_map = (void *)0xFFFFFFFFFFFFFFFF; + struct node *head = NULL, *cur; + int ret = 0; + + free_area = mmap(0, PAGE_SIZE * 4, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(free_area, PAGE_SIZE * 4); + + /* Test letting map_shadow_stack find a free space */ + shstk_start = mmap(free_area, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (shstk_start == MAP_FAILED || shstk_start != free_area) + return 1; + + while (test_map > shstk_start) { + test_map = (void *)syscall(__NR_map_shadow_stack, 0, PAGE_SIZE, 0); + if (test_map == MAP_FAILED) { + printf("[INFO]\tmap_shadow_stack MAP_FAILED\n"); + ret = 1; + break; + } + + cur = malloc(sizeof(*cur)); + cur->mapping = test_map; + + cur->next = head; + head = cur; + + if (test_map == free_area + PAGE_SIZE) { + printf("[INFO]\tNew mapping has other mapping in guard gap!\n"); + ret = 1; + break; + } + } + + while (head) { + cur = head; + head = cur->next; + munmap(cur->mapping, PAGE_SIZE); + free(cur); + } + + munmap(shstk_start, PAGE_SIZE); + + if (!ret) + printf("[OK]\tGuard gap test, placement mapping's gaps\n"); + + return ret; +} + /* * Too complicated to pull it out of the 32 bit header, but also get the * 64 bit one needed above. Just define a copy here. @@ -850,9 +903,15 @@ int main(int argc, char *argv[]) goto out; } - if (test_guard_gap()) { + if (test_guard_gap_other_gaps()) { + ret = 1; + printf("[FAIL]\tGuard gap test, other mappings' gaps\n"); + goto out; + } + + if (test_guard_gap_new_mappings_gaps()) { ret = 1; - printf("[FAIL]\tGuard gap test\n"); + printf("[FAIL]\tGuard gap test, placement mapping's gaps\n"); goto out; } diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config index 6d4ba77847b6..0b7ecfb30d19 100644 --- a/tools/tracing/rtla/Makefile.config +++ b/tools/tracing/rtla/Makefile.config @@ -3,7 +3,7 @@ STOP_ERROR := LIBTRACEEVENT_MIN_VERSION = 1.5 -LIBTRACEFS_MIN_VERSION = 1.3 +LIBTRACEFS_MIN_VERSION = 1.6 define lib_setup $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)")) diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 01870d50942a..7be17d09f7e8 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -36,13 +36,14 @@ struct osnoise_hist_params { cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; - char no_header; char no_summary; char no_index; char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct osnoise_hist_cpu { @@ -436,9 +437,9 @@ static void osnoise_hist_usage(char *usage) static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [-C[=cgroup_name]]", + " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -452,7 +453,7 @@ static void osnoise_hist_usage(char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -468,6 +469,8 @@ static void osnoise_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -531,13 +534,15 @@ static struct osnoise_hist_params {"with-zeros", no_argument, 0, '3'}, {"trigger", required_argument, 0, '4'}, {"filter", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -640,9 +645,13 @@ static struct osnoise_hist_params params->threshold = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '0') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -680,6 +689,12 @@ static struct osnoise_hist_params osnoise_hist_usage("--filter requires a previous -e\n"); } break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_hist_usage("Invalid option"); } @@ -886,6 +901,11 @@ int osnoise_hist_main(int argc, char *argv[]) goto out_hist; } + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } /* @@ -899,6 +919,25 @@ int osnoise_hist_main(int argc, char *argv[]) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_hist; + } + + } + tool->start_time = time(NULL); osnoise_hist_set_signals(params); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 457360db0767..07ba55d4ec06 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -40,6 +40,8 @@ struct osnoise_top_params { int set_sched; int cgroup; int hk_cpus; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -281,8 +283,8 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", - " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]]", + " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -296,7 +298,7 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -307,6 +309,8 @@ static void osnoise_top_usage(struct osnoise_top_params *params, char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -381,13 +385,15 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) {"trace", optional_argument, 0, 't'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, + {"warm-up", required_argument, 0, '2'}, + {"trace-buffer-size", required_argument, 0, '3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", long_options, &option_index); /* Detect the end of the options. */ @@ -480,9 +486,13 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "osnoise_trace.txt"; break; @@ -511,6 +521,12 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) osnoise_top_usage(params, "--filter requires a previous -e\n"); } break; + case '2': + params->warmup = get_llong_from_str(optarg); + break; + case '3': + params->buffer_size = get_llong_from_str(optarg); + break; default: osnoise_top_usage(params, "Invalid option"); } @@ -719,6 +735,12 @@ int osnoise_top_main(int argc, char **argv) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } /* @@ -732,6 +754,25 @@ int osnoise_top_main(int argc, char **argv) trace_instance_start(&record->trace); trace_instance_start(trace); + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_top; + + /* + * Clean up the buffer. The osnoise workload do not run + * with tracing off to avoid creating a performance penalty + * when not needed. + */ + retval = tracefs_instance_file_write(trace->inst, "trace", ""); + if (retval < 0) { + debug_msg("Error cleaning up the buffer"); + goto out_top; + } + + } + tool->start_time = time(NULL); osnoise_top_set_signals(params); diff --git a/tools/tracing/rtla/src/timerlat_aa.c b/tools/tracing/rtla/src/timerlat_aa.c index 7093fd5333be..7bd80ee2a5b4 100644 --- a/tools/tracing/rtla/src/timerlat_aa.c +++ b/tools/tracing/rtla/src/timerlat_aa.c @@ -16,6 +16,9 @@ enum timelat_state { TIMERLAT_WAITING_THREAD, }; +/* Used to fill spaces in the output */ +static const char *spaces = " "; + #define MAX_COMM 24 /* @@ -274,14 +277,17 @@ static int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, + ns_to_usf(duration)); return 0; } taa_data->thread_nmi_sum += duration; - trace_seq_printf(taa_data->nmi_seq, " %24s \t\t\t%9.2f us\n", - "nmi", ns_to_usf(duration)); + trace_seq_printf(taa_data->nmi_seq, " %24s %.*s %9.2f us\n", + "nmi", + 24, spaces, ns_to_usf(duration)); return 0; } @@ -323,8 +329,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor taa_data->prev_irq_timstamp = start; trace_seq_reset(taa_data->prev_irqs_seq); - trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->prev_irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 15, spaces, + ns_to_usf(duration)); return 0; } @@ -372,8 +380,10 @@ static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *recor * IRQ interference. */ taa_data->thread_irq_sum += duration; - trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu \t %9.2f us\n", - desc, vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->irqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + desc, vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -408,8 +418,10 @@ static int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *r taa_data->thread_softirq_sum += duration; - trace_seq_printf(taa_data->softirqs_seq, "\t%24s:%-3llu \t %9.2f us\n", - softirq_name[vector], vector, ns_to_usf(duration)); + trace_seq_printf(taa_data->softirqs_seq, " %24s:%-3llu %.*s %9.2f us\n", + softirq_name[vector], vector, + 24, spaces, + ns_to_usf(duration)); return 0; } @@ -452,8 +464,10 @@ static int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *re } else { taa_data->thread_thread_sum += duration; - trace_seq_printf(taa_data->threads_seq, "\t%24s:%-3llu \t\t%9.2f us\n", - comm, pid, ns_to_usf(duration)); + trace_seq_printf(taa_data->threads_seq, " %24s:%-12llu %.*s %9.2f us\n", + comm, pid, + 15, spaces, + ns_to_usf(duration)); } return 0; @@ -482,7 +496,8 @@ static int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *rec function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]); if (!function) break; - trace_seq_printf(taa_data->stack_seq, "\t\t-> %s\n", function); + trace_seq_printf(taa_data->stack_seq, " %.*s -> %s\n", + 14, spaces, function); } } return 0; @@ -568,23 +583,24 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay; if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration) { if (taa_data->prev_irq_timstamp < taa_data->timer_irq_start_time) - printf(" Previous IRQ interference: \t\t up to %9.2f us\n", - ns_to_usf(taa_data->prev_irq_duration)); + printf(" Previous IRQ interference: %.*s up to %9.2f us\n", + 16, spaces, + ns_to_usf(taa_data->prev_irq_duration)); } /* * The delay that the IRQ suffered before starting. */ - printf(" IRQ handler delay: %16s %9.2f us (%.2f %%)\n", - (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", - ns_to_usf(taa_data->timer_irq_start_delay), - ns_to_per(total, taa_data->timer_irq_start_delay)); + printf(" IRQ handler delay: %.*s %16s %9.2f us (%.2f %%)\n", 16, spaces, + (ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "", + ns_to_usf(taa_data->timer_irq_start_delay), + ns_to_per(total, taa_data->timer_irq_start_delay)); /* * Timerlat IRQ. */ - printf(" IRQ latency: \t\t\t\t %9.2f us\n", - ns_to_usf(taa_data->tlat_irq_latency)); + printf(" IRQ latency: %.*s %9.2f us\n", 40, spaces, + ns_to_usf(taa_data->tlat_irq_latency)); if (irq) { /* @@ -595,15 +611,16 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * so it will be displayed, it is the key. */ printf(" Blocking thread:\n"); - printf(" %24s:%-9llu\n", - taa_data->run_thread_comm, taa_data->run_thread_pid); + printf(" %.*s %24s:%-9llu\n", 6, spaces, taa_data->run_thread_comm, + taa_data->run_thread_pid); } else { /* * The duration of the IRQ handler that handled the timerlat IRQ. */ - printf(" Timerlat IRQ duration: \t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->timer_irq_duration), - ns_to_per(total, taa_data->timer_irq_duration)); + printf(" Timerlat IRQ duration: %.*s %9.2f us (%.2f %%)\n", + 30, spaces, + ns_to_usf(taa_data->timer_irq_duration), + ns_to_per(total, taa_data->timer_irq_duration)); /* * The amount of time that the current thread postponed the scheduler. @@ -611,13 +628,13 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Recalling that it is net from NMI/IRQ/Softirq interference, so there * is no need to compute values here. */ - printf(" Blocking thread: \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_blocking_duration), - ns_to_per(total, taa_data->thread_blocking_duration)); + printf(" Blocking thread: %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_blocking_duration), + ns_to_per(total, taa_data->thread_blocking_duration)); - printf(" %24s:%-9llu %9.2f us\n", - taa_data->run_thread_comm, taa_data->run_thread_pid, - ns_to_usf(taa_data->thread_blocking_duration)); + printf(" %.*s %24s:%-9llu %.*s %9.2f us\n", 6, spaces, + taa_data->run_thread_comm, taa_data->run_thread_pid, + 12, spaces, ns_to_usf(taa_data->thread_blocking_duration)); } /* @@ -629,9 +646,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * NMIs can happen during the IRQ, so they are always possible. */ if (taa_data->thread_nmi_sum) - printf(" NMI interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_nmi_sum), - ns_to_per(total, taa_data->thread_nmi_sum)); + printf(" NMI interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_nmi_sum), + ns_to_per(total, taa_data->thread_nmi_sum)); /* * If it is an IRQ latency, the other factors can be skipped. @@ -643,9 +660,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by IRQs to the thread latency. */ if (taa_data->thread_irq_sum) { - printf(" IRQ interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_irq_sum), - ns_to_per(total, taa_data->thread_irq_sum)); + printf(" IRQ interference %.*s %9.2f us (%.2f %%)\n", 36, spaces, + ns_to_usf(taa_data->thread_irq_sum), + ns_to_per(total, taa_data->thread_irq_sum)); trace_seq_do_printf(taa_data->irqs_seq); } @@ -654,9 +671,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * Prints the interference caused by Softirqs to the thread latency. */ if (taa_data->thread_softirq_sum) { - printf(" Softirq interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_softirq_sum), - ns_to_per(total, taa_data->thread_softirq_sum)); + printf(" Softirq interference %.*s %9.2f us (%.2f %%)\n", 32, spaces, + ns_to_usf(taa_data->thread_softirq_sum), + ns_to_per(total, taa_data->thread_softirq_sum)); trace_seq_do_printf(taa_data->softirqs_seq); } @@ -670,9 +687,9 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, * timer handling latency. */ if (taa_data->thread_thread_sum) { - printf(" Thread interference \t\t\t %9.2f us (%.2f %%)\n", - ns_to_usf(taa_data->thread_thread_sum), - ns_to_per(total, taa_data->thread_thread_sum)); + printf(" Thread interference %.*s %9.2f us (%.2f %%)\n", 33, spaces, + ns_to_usf(taa_data->thread_thread_sum), + ns_to_per(total, taa_data->thread_thread_sum)); trace_seq_do_printf(taa_data->threads_seq); } @@ -682,8 +699,8 @@ static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu, */ print_total: printf("------------------------------------------------------------------------\n"); - printf(" %s latency: \t\t\t %9.2f us (100%%)\n", irq ? "IRQ" : "Thread", - ns_to_usf(total)); + printf(" %s latency: %.*s %9.2f us (100%%)\n", irq ? " IRQ" : "Thread", + 37, spaces, ns_to_usf(total)); } static int timerlat_auto_analysis_collect_trace(struct timerlat_aa_context *taa_ctx) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8bd51aab6513..a3907c390d67 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -40,6 +40,7 @@ struct timerlat_hist_params { int no_aa; int dump_tasks; int user_workload; + int kernel_workload; int user_hist; cpu_set_t hk_cpu_set; struct sched_attr sched_param; @@ -52,6 +53,8 @@ struct timerlat_hist_params { char with_zeros; int bucket_size; int entries; + int warmup; + int buffer_size; }; struct timerlat_hist_cpu { @@ -324,17 +327,29 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); @@ -384,25 +399,164 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_irq); + else + trace_seq_printf(trace->seq, " - "); + } - if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_thread); + else + trace_seq_printf(trace->seq, " - "); + } - if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); + if (params->user_hist) { + if (data->hist[cpu].user_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_user); + else + trace_seq_printf(trace->seq, " - "); + } } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } +static void +timerlat_print_stats_all(struct timerlat_hist_params *params, + struct trace_instance *trace, + struct timerlat_hist_data *data) +{ + struct timerlat_hist_cpu *cpu_data; + struct timerlat_hist_cpu sum; + int cpu; + + if (params->no_summary) + return; + + memset(&sum, 0, sizeof(sum)); + sum.min_irq = ~0; + sum.min_thread = ~0; + sum.min_user = ~0; + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !CPU_ISSET(cpu, ¶ms->monitored_cpus)) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + cpu_data = &data->hist[cpu]; + + sum.irq_count += cpu_data->irq_count; + update_min(&sum.min_irq, &cpu_data->min_irq); + update_sum(&sum.sum_irq, &cpu_data->sum_irq); + update_max(&sum.max_irq, &cpu_data->max_irq); + + sum.thread_count += cpu_data->thread_count; + update_min(&sum.min_thread, &cpu_data->min_thread); + update_sum(&sum.sum_thread, &cpu_data->sum_thread); + update_max(&sum.max_thread, &cpu_data->max_thread); + + sum.user_count += cpu_data->user_count; + update_min(&sum.min_user, &cpu_data->min_user); + update_sum(&sum.sum_user, &cpu_data->sum_user); + update_max(&sum.max_user, &cpu_data->max_user); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "ALL: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, " IRQ"); + + if (!params->no_thread) + trace_seq_printf(trace->seq, " Thr"); + + if (params->user_hist) + trace_seq_printf(trace->seq, " Usr"); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9d ", + sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.min_user); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_irq / sum.irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_thread / sum.thread_count); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.sum_user / sum.user_count); + + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_thread); + + if (params->user_hist) + trace_seq_printf(trace->seq, "%9llu ", + sum.max_user); + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + /* - * timerlat_print_stats - print data for all CPUs + * timerlat_print_stats - print data for each CPUs */ static void timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool) @@ -485,6 +639,7 @@ timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *t trace_seq_reset(trace->seq); timerlat_print_summary(params, trace, data); + timerlat_print_stats_all(params, trace, data); } /* @@ -497,9 +652,10 @@ static void timerlat_hist_usage(char *usage) char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", - " [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u]", + " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", + " [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -513,7 +669,7 @@ static void timerlat_hist_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", " -D/--debug: print debug info", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -534,8 +690,11 @@ static void timerlat_hist_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -597,6 +756,7 @@ static struct timerlat_hist_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, @@ -610,13 +770,15 @@ static struct timerlat_hist_params {"dma-latency", required_argument, 0, '8'}, {"no-aa", no_argument, 0, '9'}, {"dump-task", no_argument, 0, '\1'}, + {"warm-up", required_argument, 0, '\2'}, + {"trace-buffer-size", required_argument, 0, '\3'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1", + c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3", long_options, &option_index); /* detect the end of the options. */ @@ -699,6 +861,9 @@ static struct timerlat_hist_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = 1; + break; case 'n': params->output_divisor = 1; break; @@ -720,9 +885,13 @@ static struct timerlat_hist_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; break; @@ -785,6 +954,12 @@ static struct timerlat_hist_params case '\1': params->dump_tasks = 1; break; + case '\2': + params->warmup = get_llong_from_str(optarg); + break; + case '\3': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_hist_usage("Invalid option"); } @@ -807,6 +982,9 @@ static struct timerlat_hist_params if (!params->stop_us && !params->stop_total_us) params->no_aa = 1; + if (params->kernel_workload && params->user_workload) + timerlat_hist_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -882,6 +1060,22 @@ timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_param auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_hist = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_hist) { retval = osnoise_set_workload(tool->context, 0); if (retval) { @@ -1019,6 +1213,12 @@ int timerlat_hist_main(int argc, char *argv[]) if (retval) goto out_hist; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_hist; + } } if (!params->no_aa) { @@ -1039,22 +1239,6 @@ int timerlat_hist_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - tool->start_time = time(NULL); - timerlat_hist_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -1074,6 +1258,29 @@ int timerlat_hist_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + if (stop_tracing) + goto out_hist; + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + tool->start_time = time(NULL); + timerlat_hist_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); @@ -1099,6 +1306,7 @@ int timerlat_hist_main(int argc, char *argv[]) } } } + if (params->user_workload && !params_u.stopped_running) { params_u.should_run = 0; sleep(1); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 8a3fa64319c6..8c16419fe22a 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -44,6 +44,10 @@ struct timerlat_top_params { int hk_cpus; int user_top; int user_workload; + int kernel_workload; + int pretty_output; + int warmup; + int buffer_size; cpu_set_t hk_cpu_set; struct sched_attr sched_param; struct trace_events *events; @@ -118,6 +122,37 @@ cleanup: return NULL; } +static void +timerlat_top_reset_sum(struct timerlat_top_cpu *summary) +{ + memset(summary, 0, sizeof(*summary)); + summary->min_irq = ~0; + summary->min_thread = ~0; + summary->min_user = ~0; +} + +static void +timerlat_top_update_sum(struct osnoise_tool *tool, int cpu, struct timerlat_top_cpu *sum) +{ + struct timerlat_top_data *data = tool->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + + sum->irq_count += cpu_data->irq_count; + update_min(&sum->min_irq, &cpu_data->min_irq); + update_sum(&sum->sum_irq, &cpu_data->sum_irq); + update_max(&sum->max_irq, &cpu_data->max_irq); + + sum->thread_count += cpu_data->thread_count; + update_min(&sum->min_thread, &cpu_data->min_thread); + update_sum(&sum->sum_thread, &cpu_data->sum_thread); + update_max(&sum->max_thread, &cpu_data->max_thread); + + sum->user_count += cpu_data->user_count; + update_min(&sum->min_user, &cpu_data->min_user); + update_sum(&sum->sum_user, &cpu_data->sum_user); + update_max(&sum->max_user, &cpu_data->max_user); +} + /* * timerlat_hist_update - record a new timerlat occurent on cpu, updating data */ @@ -179,19 +214,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record, /* * timerlat_top_header - print the header of the tool output */ -static void timerlat_top_header(struct osnoise_tool *top) +static void timerlat_top_header(struct timerlat_top_params *params, struct osnoise_tool *top) { - struct timerlat_top_params *params = top->params; struct trace_seq *s = top->trace.seq; char duration[26]; get_duration(top->start_time, duration, sizeof(duration)); - trace_seq_printf(s, "\033[2;37;40m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Timer Latency "); if (params->user_top) trace_seq_printf(s, " "); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); trace_seq_printf(s, "%-6s | IRQ Timer Latency (%s) | Thread Timer Latency (%s)", duration, @@ -204,14 +242,20 @@ static void timerlat_top_header(struct osnoise_tool *top) } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "\033[2;30;47m"); + if (params->pretty_output) + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU COUNT | cur min avg max | cur min avg max"); if (params->user_top) trace_seq_printf(s, " | cur min avg max"); - trace_seq_printf(s, "\033[0;0;0m"); + + if (params->pretty_output) + trace_seq_printf(s, "\033[0;0;0m"); trace_seq_printf(s, "\n"); } +static const char *no_value = " -"; + /* * timerlat_top_print - prints the output of a given CPU */ @@ -239,10 +283,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, "%3d #%-9d |", cpu, cpu_data->irq_count); if (!cpu_data->irq_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - |"); + trace_seq_printf(s, "%s %s %s %s |", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_irq / params->output_divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_irq / params->output_divisor); @@ -251,10 +292,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } if (!cpu_data->thread_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_thread / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_thread / divisor); @@ -271,10 +309,7 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) trace_seq_printf(s, " |"); if (!cpu_data->user_count) { - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " - "); - trace_seq_printf(s, " -\n"); + trace_seq_printf(s, "%s %s %s %s\n", no_value, no_value, no_value, no_value); } else { trace_seq_printf(s, "%9llu ", cpu_data->cur_user / divisor); trace_seq_printf(s, "%9llu ", cpu_data->min_user / divisor); @@ -285,6 +320,77 @@ static void timerlat_top_print(struct osnoise_tool *top, int cpu) } /* + * timerlat_top_print_sum - prints the summary output + */ +static void +timerlat_top_print_sum(struct osnoise_tool *top, struct timerlat_top_cpu *summary) +{ + const char *split = "----------------------------------------"; + struct timerlat_top_params *params = top->params; + unsigned long long count = summary->irq_count; + int divisor = params->output_divisor; + struct trace_seq *s = top->trace.seq; + int e = 0; + + if (divisor == 0) + return; + + /* + * Skip if no data is available: is this cpu offline? + */ + if (!summary->irq_count && !summary->thread_count) + return; + + while (count > 999999) { + e++; + count /= 10; + } + + trace_seq_printf(s, "%.*s|%.*s|%.*s", 15, split, 40, split, 39, split); + if (params->user_top) + trace_seq_printf(s, "-|%.*s", 39, split); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "ALL #%-6llu e%d |", count, e); + + if (!summary->irq_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", (summary->sum_irq / summary->irq_count) / divisor); + trace_seq_printf(s, "%9llu |", summary->max_irq / divisor); + } + + if (!summary->thread_count) { + trace_seq_printf(s, "%s %s %s %s", no_value, no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_thread / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_thread / summary->thread_count) / divisor); + trace_seq_printf(s, "%9llu", summary->max_thread / divisor); + } + + if (!params->user_top) { + trace_seq_printf(s, "\n"); + return; + } + + trace_seq_printf(s, " |"); + + if (!summary->user_count) { + trace_seq_printf(s, " %s %s %s |", no_value, no_value, no_value); + } else { + trace_seq_printf(s, " "); + trace_seq_printf(s, "%9llu ", summary->min_user / divisor); + trace_seq_printf(s, "%9llu ", + (summary->sum_user / summary->user_count) / divisor); + trace_seq_printf(s, "%9llu\n", summary->max_user / divisor); + } +} + +/* * clear_terminal - clears the output terminal */ static void clear_terminal(struct trace_seq *seq) @@ -300,6 +406,7 @@ static void timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top) { struct trace_instance *trace = &top->trace; + struct timerlat_top_cpu summary; static int nr_cpus = -1; int i; @@ -312,14 +419,19 @@ timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *to if (!params->quiet) clear_terminal(trace->seq); - timerlat_top_header(top); + timerlat_top_reset_sum(&summary); + + timerlat_top_header(params, top); for (i = 0; i < nr_cpus; i++) { if (params->cpus && !CPU_ISSET(i, ¶ms->monitored_cpus)) continue; timerlat_top_print(top, i); + timerlat_top_update_sum(top, i, &summary); } + timerlat_top_print_sum(top, &summary); + trace_seq_do_printf(trace->seq); trace_seq_reset(trace->seq); } @@ -334,8 +446,8 @@ static void timerlat_top_usage(char *usage) static const char *const msg[] = { "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", - " [[-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u]", + " [[-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -350,7 +462,7 @@ static void timerlat_top_usage(char *usage) " -d/--duration time[m|h|d]: duration of the session in seconds", " -D/--debug: print debug info", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", - " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <command>: enable a trace event filter to the previous -e event", " --trigger <command>: enable a trace event trigger to the previous -e event", @@ -364,8 +476,11 @@ static void timerlat_top_usage(char *usage) " f:prio - use SCHED_FIFO with prio", " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", " in nanoseconds", - " -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads", + " -u/--user-threads: use rtla user-space threads instead of kernel-space timerlat threads", + " -k/--kernel-threads: use timerlat kernel-space threads instead of rtla user-space threads", " -U/--user-load: enable timerlat for user-defined user-space workload", + " --warm-up s: let the workload run for s seconds before collecting data", + " --trace-buffer-size kB: set the per-cpu trace buffer size in kB", NULL, }; @@ -425,6 +540,7 @@ static struct timerlat_top_params {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, + {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, {"trigger", required_argument, 0, '0'}, {"filter", required_argument, 0, '1'}, @@ -432,13 +548,15 @@ static struct timerlat_top_params {"no-aa", no_argument, 0, '3'}, {"dump-tasks", no_argument, 0, '4'}, {"aa-only", required_argument, 0, '5'}, + {"warm-up", required_argument, 0, '6'}, + {"trace-buffer-size", required_argument, 0, '7'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:", + c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", long_options, &option_index); /* detect the end of the options. */ @@ -523,6 +641,9 @@ static struct timerlat_top_params case 'i': params->stop_us = get_llong_from_str(optarg); break; + case 'k': + params->kernel_workload = true; + break; case 'n': params->output_divisor = 1; break; @@ -547,9 +668,13 @@ static struct timerlat_top_params params->stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) - /* skip = */ - params->trace_output = &optarg[1]; + if (optarg) { + if (optarg[0] == '=') + params->trace_output = &optarg[1]; + else + params->trace_output = &optarg[0]; + } else if (optind < argc && argv[optind][0] != '-') + params->trace_output = argv[optind]; else params->trace_output = "timerlat_trace.txt"; @@ -595,6 +720,12 @@ static struct timerlat_top_params case '4': params->dump_tasks = 1; break; + case '6': + params->warmup = get_llong_from_str(optarg); + break; + case '7': + params->buffer_size = get_llong_from_str(optarg); + break; default: timerlat_top_usage("Invalid option"); } @@ -614,6 +745,9 @@ static struct timerlat_top_params if (params->no_aa && params->aa_only) timerlat_top_usage("--no-aa and --aa-only are mutually exclusive!"); + if (params->kernel_workload && params->user_workload) + timerlat_top_usage("--kernel-threads and --user-threads are mutually exclusive!"); + return params; } @@ -692,6 +826,22 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * auto_house_keeping(¶ms->monitored_cpus); } + /* + * If the user did not specify a type of thread, try user-threads first. + * Fall back to kernel threads otherwise. + */ + if (!params->kernel_workload && !params->user_workload) { + retval = tracefs_file_exists(NULL, "osnoise/per_cpu/cpu0/timerlat_fd"); + if (retval) { + debug_msg("User-space interface detected, setting user-threads\n"); + params->user_workload = 1; + params->user_top = 1; + } else { + debug_msg("User-space interface not detected, setting kernel-threads\n"); + params->kernel_workload = 1; + } + } + if (params->user_top) { retval = osnoise_set_workload(top->context, 0); if (retval) { @@ -700,6 +850,9 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params * } } + if (isatty(1) && !params->quiet) + params->pretty_output = 1; + return 0; out_err: @@ -830,6 +983,12 @@ int timerlat_top_main(int argc, char *argv[]) if (retval) goto out_top; } + + if (params->buffer_size > 0) { + retval = trace_set_buffer_size(&record->trace, params->buffer_size); + if (retval) + goto out_top; + } } if (!params->no_aa) { @@ -859,22 +1018,6 @@ int timerlat_top_main(int argc, char *argv[]) } } - /* - * Start the tracers here, after having set all instances. - * - * Let the trace instance start first for the case of hitting a stop - * tracing while enabling other instances. The trace instance is the - * one with most valuable information. - */ - if (params->trace_output) - trace_instance_start(&record->trace); - if (!params->no_aa && aa != top) - trace_instance_start(&aa->trace); - trace_instance_start(trace); - - top->start_time = time(NULL); - timerlat_top_set_signals(params); - if (params->user_workload) { /* rtla asked to stop */ params_u.should_run = 1; @@ -894,6 +1037,27 @@ int timerlat_top_main(int argc, char *argv[]) err_msg("Error creating timerlat user-space threads\n"); } + if (params->warmup > 0) { + debug_msg("Warming up for %d seconds\n", params->warmup); + sleep(params->warmup); + } + + /* + * Start the tracers here, after having set all instances. + * + * Let the trace instance start first for the case of hitting a stop + * tracing while enabling other instances. The trace instance is the + * one with most valuable information. + */ + if (params->trace_output) + trace_instance_start(&record->trace); + if (!params->no_aa && aa != top) + trace_instance_start(&aa->trace); + trace_instance_start(trace); + + top->start_time = time(NULL); + timerlat_top_set_signals(params); + while (!stop_tracing) { sleep(params->sleep_time); diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index e1ba6d9f4265..170a706248ab 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -540,3 +540,18 @@ int trace_is_off(struct trace_instance *tool, struct trace_instance *trace) return 0; } + +/* + * trace_set_buffer_size - set the per-cpu tracing buffer size. + */ +int trace_set_buffer_size(struct trace_instance *trace, int size) +{ + int retval; + + debug_msg("Setting trace buffer size to %d Kb\n", size); + retval = tracefs_instance_set_buffer_size(trace->inst, size, -1); + if (retval) + err_msg("Error setting trace buffer size\n"); + + return retval; +} diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h index 2e9a89a25615..c7c92dc9a18a 100644 --- a/tools/tracing/rtla/src/trace.h +++ b/tools/tracing/rtla/src/trace.h @@ -48,3 +48,4 @@ int trace_events_enable(struct trace_instance *instance, int trace_event_add_filter(struct trace_events *event, char *filter); int trace_event_add_trigger(struct trace_events *event, char *trigger); int trace_is_off(struct trace_instance *tool, struct trace_instance *trace); +int trace_set_buffer_size(struct trace_instance *trace, int size); diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py new file mode 100644 index 000000000000..5e3591f1f9a9 --- /dev/null +++ b/tools/writeback/wb_monitor.py @@ -0,0 +1,172 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Kemeng Shi <shikemeng@huaweicloud.com> +# Copyright (C) 2024 Huawei Inc + +desc = """ +This is a drgn script based on wq_monitor.py to monitor writeback info on +backing dev. For more info on drgn, visit https://github.com/osandov/drgn. + + writeback(kB) Amount of dirty pages are currently being written back to + disk. + + reclaimable(kB) Amount of pages are currently reclaimable. + + dirtied(kB) Amount of pages have been dirtied. + + wrttien(kB) Amount of dirty pages have been written back to disk. + + avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages + back to disk. +""" + +import signal +import re +import time +import json + +import drgn +from drgn.helpers.linux.list import list_for_each_entry + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument('bdi', metavar='REGEX', nargs='*', + help='Target backing device name patterns (all if empty)') +parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1, + help='Monitoring interval (0 to print once and exit)') +parser.add_argument('-j', '--json', action='store_true', + help='Output in json') +parser.add_argument('-c', '--cgroup', action='store_true', + help='show writeback of bdi in cgroup') +args = parser.parse_args() + +bdi_list = prog['bdi_list'] + +WB_RECLAIMABLE = prog['WB_RECLAIMABLE'] +WB_WRITEBACK = prog['WB_WRITEBACK'] +WB_DIRTIED = prog['WB_DIRTIED'] +WB_WRITTEN = prog['WB_WRITTEN'] +NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS'] + +PAGE_SHIFT = prog['PAGE_SHIFT'] + +def K(x): + return x << (PAGE_SHIFT - 10) + +class Stats: + def dict(self, now): + return { 'timestamp' : now, + 'name' : self.name, + 'writeback' : self.stats[WB_WRITEBACK], + 'reclaimable' : self.stats[WB_RECLAIMABLE], + 'dirtied' : self.stats[WB_DIRTIED], + 'written' : self.stats[WB_WRITTEN], + 'avg_wb' : self.avg_bw, } + + def table_header_str(): + return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \ + f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}' + + def table_row_str(self): + out = f'{self.name[-16:]:16} ' \ + f'{self.stats[WB_WRITEBACK]:10} ' \ + f'{self.stats[WB_RECLAIMABLE]:12} ' \ + f'{self.stats[WB_DIRTIED]:9} ' \ + f'{self.stats[WB_WRITTEN]:9} ' \ + f'{self.avg_bw:9} ' + return out + + def show_header(): + if Stats.table_fmt: + print() + print(Stats.table_header_str()) + + def show_stats(self): + if Stats.table_fmt: + print(self.table_row_str()) + else: + print(self.dict(Stats.now)) + +class WbStats(Stats): + def __init__(self, wb): + bdi_name = wb.bdi.dev_name.string_().decode() + # avoid to use bdi.wb.memcg_css which is only defined when + # CONFIG_CGROUP_WRITEBACK is enabled + if wb == wb.bdi.wb.address_of_(): + ino = "1" + else: + ino = str(wb.memcg_css.cgroup.kn.id.value_()) + self.name = bdi_name + '_' + ino + + self.stats = [0] * NR_WB_STAT_ITEMS + for i in range(NR_WB_STAT_ITEMS): + if wb.stat[i].count >= 0: + self.stats[i] = int(K(wb.stat[i].count)) + else: + self.stats[i] = 0 + + self.avg_bw = int(K(wb.avg_write_bandwidth)) + +class BdiStats(Stats): + def __init__(self, bdi): + self.name = bdi.dev_name.string_().decode() + self.stats = [0] * NR_WB_STAT_ITEMS + self.avg_bw = 0 + + def collectStats(self, wb_stats): + for i in range(NR_WB_STAT_ITEMS): + self.stats[i] += wb_stats.stats[i] + + self.avg_bw += wb_stats.avg_bw + +exit_req = False + +def sigint_handler(signr, frame): + global exit_req + exit_req = True + +def main(): + # handle args + Stats.table_fmt = not args.json + interval = args.interval + cgroup = args.cgroup + + re_str = None + if args.bdi: + for r in args.bdi: + if re_str is None: + re_str = r + else: + re_str += '|' + r + + filter_re = re.compile(re_str) if re_str else None + + # monitoring loop + signal.signal(signal.SIGINT, sigint_handler) + + while not exit_req: + Stats.now = time.time() + + Stats.show_header() + for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'): + bdi_stats = BdiStats(bdi) + if filter_re and not filter_re.search(bdi_stats.name): + continue + + for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'): + wb_stats = WbStats(wb) + bdi_stats.collectStats(wb_stats) + if cgroup: + wb_stats.show_stats() + + bdi_stats.show_stats() + if cgroup and Stats.table_fmt: + print() + + if interval == 0: + break + time.sleep(interval) + +if __name__ == "__main__": + main() |