From 929ad065ba2967be238dfdc0895b79fda62c7f16 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:56 +0200 Subject: locking/atomic/x86: Correct the definition of __arch_try_cmpxchg128() Correct the definition of __arch_try_cmpxchg128(), introduced by: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") Fixes: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-2-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 44b08b53ab32..c1d6cd58f809 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -62,7 +62,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, asm volatile(_lock "cmpxchg16b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ - [ptr] "+m" (*ptr), \ + [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ : "memory"); \ -- cgit v1.2.3 From 7016cc5def44b9dcb28089efae4412fa0d6c78c2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:57 +0200 Subject: locking/atomic/x86: Modernize x86_32 arch_{,try_}_cmpxchg64{,_local}() Commit: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") introduced arch_{,try_}_cmpxchg128{,_local}() for x86_64 targets. Modernize existing x86_32 arch_{,try_}_cmpxchg64{,_local}() definitions to follow the same structure as the definitions introduced by the above commit. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-3-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 179 +++++++++++++++++++++----------------- 1 file changed, 100 insertions(+), 79 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index b5731c51f0f4..fe40d0681ea8 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -3,103 +3,124 @@ #define _ASM_X86_CMPXCHG_32_H /* - * Note: if you use set64_bit(), __cmpxchg64(), or their variants, + * Note: if you use __cmpxchg64(), or their variants, * you need to test for the feature in boot_cpu_data. 
*/ -#ifdef CONFIG_X86_CMPXCHG64 -#define arch_cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_try_cmpxchg64(ptr, po, n) \ - __try_cmpxchg64((ptr), (unsigned long long *)(po), \ - (unsigned long long)(n)) -#endif +union __u64_halves { + u64 full; + struct { + u32 low, high; + }; +}; + +#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) -static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) + +static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - u64 prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX); } -static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - u64 prev; - asm volatile("cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_cmpxchg64(ptr, old, new,); } -static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - bool success; - u64 old = *pold; - asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" - CC_SET(z) - : CC_OUT(z) (success), - [ptr] "+m" (*ptr), - "+A" (old) - : "b" ((u32)new), - "c" ((u32)(new >> 32)) - : "memory"); - - if (unlikely(!success)) - *pold = old; - return success; + return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } -#ifndef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CMPXCHG64 + +#define arch_cmpxchg64 __cmpxchg64 + +#define arch_cmpxchg64_local __cmpxchg64_local + +#define arch_try_cmpxchg64 __try_cmpxchg64 + +#else + /* * Building a kernel capable running on 80386 and 80486. It may be necessary * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
*/ -#define arch_cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io(LOCK_PREFIX_HERE \ - "call cmpxchg8b_emu", \ - "lock; cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) - - -#define arch_cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io("call cmpxchg8b_emu", \ - "cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu(_ptr, _old, _new) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + "call cmpxchg8b_emu", \ + "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new); +} +#define arch_cmpxchg64 arch_cmpxchg64 + +#define __arch_cmpxchg64_emu_local(_ptr, _old, _new) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ + "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu_local(ptr, old, new); +} +#define arch_cmpxchg64_local arch_cmpxchg64_local #endif -- cgit v1.2.3 From aef95dac9ce4f271cc43195ffc175114ed934cbe Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:58 +0200 Subject: locking/atomic/x86: Introduce arch_try_cmpxchg64() for !CONFIG_X86_CMPXCHG64 Commit: 6d12c8d308e68 ("percpu: Wire up cmpxchg128") improved the emulated cmpxchg8b_emu() library function to return success/failure in the ZF flag. Define arch_try_cmpxchg64() for !CONFIG_X86_CMPXCHG64 targets to override the generic arch_try_cmpxchg64() with an optimized target-specific implementation that handles the ZF flag. The assembly code at the call sites improves from: bf56d: e8 fc ff ff ff call cmpxchg8b_emu bf572: 8b 74 24 28 mov 0x28(%esp),%esi bf576: 89 c3 mov %eax,%ebx bf578: 89 d1 mov %edx,%ecx bf57a: 8b 7c 24 2c mov 0x2c(%esp),%edi bf57e: 89 f0 mov %esi,%eax bf580: 89 fa mov %edi,%edx bf582: 31 d8 xor %ebx,%eax bf584: 31 ca xor %ecx,%edx bf586: 09 d0 or %edx,%eax bf588: 0f 84 e3 01 00 00 je bf771 <...> to: bf572: e8 fc ff ff ff call cmpxchg8b_emu bf577: 0f 84 b6 01 00 00 je bf733 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H.
Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-4-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fe40d0681ea8..9e0d330dd5d0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -122,6 +122,34 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 } #define arch_cmpxchg64_local arch_cmpxchg64_local +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + "call cmpxchg8b_emu", \ + "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new); +} +#define arch_try_cmpxchg64 arch_try_cmpxchg64 + #endif #define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) -- cgit v1.2.3 From 276b893049e4cdc2f33c009706a75ec18a114485 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:33 +0200 Subject: locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32 Introduce arch_atomic64_try_cmpxchg() for 32-bit targets to use optimized target specific implementation instead of a generic one. This implementation eliminates dual-word compare after cmpxchg8b instruction and improves generated asm code from: 2273: f0 0f c7 0f lock cmpxchg8b (%edi) 2277: 8b 74 24 2c mov 0x2c(%esp),%esi 227b: 89 d3 mov %edx,%ebx 227d: 89 c2 mov %eax,%edx 227f: 89 5c 24 10 mov %ebx,0x10(%esp) 2283: 8b 7c 24 30 mov 0x30(%esp),%edi 2287: 89 44 24 1c mov %eax,0x1c(%esp) 228b: 31 f2 xor %esi,%edx 228d: 89 d0 mov %edx,%eax 228f: 89 da mov %ebx,%edx 2291: 31 fa xor %edi,%edx 2293: 09 d0 or %edx,%eax 2295: 0f 85 a5 00 00 00 jne 2340 <...> to: 2270: f0 0f c7 0f lock cmpxchg8b (%edi) 2274: 0f 85 a6 00 00 00 jne 2320 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-1-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 3486d91b8595..ec217aaf41eb 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -61,12 +61,18 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return arch_cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg64(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; -- cgit v1.2.3 From e73c4e34a0e9e3dfcb4e5ee4ccd3039a7b603218 Mon 
Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:34 +0200 Subject: locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() to x86_32 Introduce arch_atomic64_read_nonatomic() for 32-bit targets to load the value from an atomic64_t location in a non-atomic way. This function is intended to be used in cases where a subsequent atomic operation will handle the torn value, and can be used to prime the first iteration of unconditional try_cmpxchg() loops. Suggested-by: Mark Rutland Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-2-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index ec217aaf41eb..bbdf174de110 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,6 +14,32 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +/* + * Read an atomic64_t non-atomically. + * + * This is intended to be used in cases where a subsequent atomic operation + * will handle the torn value, and can be used to prime the first iteration + * of unconditional try_cmpxchg() loops, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i)); + * + * This is NOT safe to use where the value is not always checked by a + * subsequent atomic operation, such as in conditional try_cmpxchg() loops + * that can break before the atomic operation, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { + * if (condition(val)) + * break; + * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i)); + */ +static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) +{ + /* See comment in arch_atomic_read(). */ + return __READ_ONCE(v->counter); +} + #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) #ifndef ATOMIC64_EXPORT #define ATOMIC64_DECL_ONE __ATOMIC64_DECL -- cgit v1.2.3 From 95ece48165c136b96fae0f6144f55cbf8b24aeb9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:35 +0200 Subject: locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to use arch_atomic64_try_cmpxchg(). This implementation avoids one extra trip through the CMPXCHG loop. The value preload before the cmpxchg loop does not need to be atomic. Use arch_atomic64_read_nonatomic(v) to load the value from the atomic64_t location in a non-atomic way.
The generated code improves from: 1917d5: 31 c9 xor %ecx,%ecx 1917d7: 31 db xor %ebx,%ebx 1917d9: 89 4c 24 3c mov %ecx,0x3c(%esp) 1917dd: 8b 74 24 24 mov 0x24(%esp),%esi 1917e1: 89 c8 mov %ecx,%eax 1917e3: 89 5c 24 34 mov %ebx,0x34(%esp) 1917e7: 8b 7c 24 28 mov 0x28(%esp),%edi 1917eb: 21 ce and %ecx,%esi 1917ed: 89 74 24 4c mov %esi,0x4c(%esp) 1917f1: 21 df and %ebx,%edi 1917f3: 89 de mov %ebx,%esi 1917f5: 89 7c 24 50 mov %edi,0x50(%esp) 1917f9: 8b 54 24 4c mov 0x4c(%esp),%edx 1917fd: 8b 7c 24 2c mov 0x2c(%esp),%edi 191801: 8b 4c 24 50 mov 0x50(%esp),%ecx 191805: 89 d3 mov %edx,%ebx 191807: 89 f2 mov %esi,%edx 191809: f0 0f c7 0f lock cmpxchg8b (%edi) 19180d: 89 c1 mov %eax,%ecx 19180f: 8b 74 24 34 mov 0x34(%esp),%esi 191813: 89 d3 mov %edx,%ebx 191815: 89 44 24 4c mov %eax,0x4c(%esp) 191819: 8b 44 24 3c mov 0x3c(%esp),%eax 19181d: 89 df mov %ebx,%edi 19181f: 89 54 24 44 mov %edx,0x44(%esp) 191823: 89 ca mov %ecx,%edx 191825: 31 de xor %ebx,%esi 191827: 31 c8 xor %ecx,%eax 191829: 09 f0 or %esi,%eax 19182b: 75 ac jne 1917d9 <...> to: 1912ba: 8b 06 mov (%esi),%eax 1912bc: 8b 56 04 mov 0x4(%esi),%edx 1912bf: 89 44 24 3c mov %eax,0x3c(%esp) 1912c3: 89 c1 mov %eax,%ecx 1912c5: 23 4c 24 34 and 0x34(%esp),%ecx 1912c9: 89 d3 mov %edx,%ebx 1912cb: 23 5c 24 38 and 0x38(%esp),%ebx 1912cf: 89 54 24 40 mov %edx,0x40(%esp) 1912d3: 89 4c 24 2c mov %ecx,0x2c(%esp) 1912d7: 89 5c 24 30 mov %ebx,0x30(%esp) 1912db: 8b 5c 24 2c mov 0x2c(%esp),%ebx 1912df: 8b 4c 24 30 mov 0x30(%esp),%ecx 1912e3: f0 0f c7 0e lock cmpxchg8b (%esi) 1912e7: 0f 85 f3 02 00 00 jne 1915e0 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-3-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 43 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index bbdf174de110..40ff73b5ec55 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -227,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); - return old; + return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); - return old; + return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = 
arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); - return old; + return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i)); - return old; + return val; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -- cgit v1.2.3 From 21689e4bfb9ae8f8b45279c53faecaa5a056ffa5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:36 +0200 Subject: locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions There is no need to implement arch_atomic_sub() family of inline functions, corresponding macros can be directly implemented using arch_atomic_add() inlines with negated argument. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-4-ubizjak@gmail.com --- arch/x86/include/asm/atomic.h | 12 ++---------- arch/x86/include/asm/atomic64_64.h | 12 ++---------- 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 55a55ec04350..55b4d24356ea 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) } #define arch_atomic_add_return arch_atomic_add_return -static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) -{ - return arch_atomic_add_return(-i, v); -} -#define arch_atomic_sub_return arch_atomic_sub_return +#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { @@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) } #define arch_atomic_fetch_add arch_atomic_fetch_add -static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic_fetch_sub arch_atomic_fetch_sub +#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 3165c0feedf7..ae12acae5b06 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) } #define arch_atomic64_add_return arch_atomic64_add_return -static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) -{ - return arch_atomic64_add_return(-i, v); -} -#define arch_atomic64_sub_return arch_atomic64_sub_return +#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { @@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) } #define 
arch_atomic64_fetch_add arch_atomic64_fetch_add -static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { -- cgit v1.2.3 From 91095666125a666c8f20c2323b742c53165c0325 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 12 Apr 2024 10:38:53 +0200 Subject: locking/pvqspinlock/x86: Remove redundant CMP after CMPXCHG in __raw_callee_save___pv_queued_spin_unlock() x86 CMPXCHG instruction returns success in the ZF flag. Remove redundant CMP instruction after CMPXCHG that performs the same check. Also update the function comment to mention the modern version of the equivalent C code. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240412083908.282802-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock_paravirt.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index ef9697f20129..466af57b8ed6 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { - * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = _Q_LOCKED_VAL; * - * if (likely(lockval == _Q_LOCKED_VAL)) + * if (try_cmpxchg(&lock->locked, &lockval, 0)) * return; * pv_queued_spin_unlock_slowpath(lock, lockval); * } @@ -43,7 +43,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "mov $0x1,%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ - "cmp $0x1,%al\n\t" \ "jne .slowpath\n\t" \ "pop %rdx\n\t" \ FRAME_END \ -- cgit v1.2.3 From d26e46f6bf329cfcc469878709baa41d3bfc7cc3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 14 Apr 2024 18:12:43 +0200 Subject: locking/atomic/x86: Introduce arch_try_cmpxchg64_local() Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets to improve code using cmpxchg64_local(). On 64-bit targets, the generated assembly improves from: 3e28: 31 c0 xor %eax,%eax 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13) 3e2f: 48 85 c0 test %rax,%rax 3e32: 0f 85 9f 00 00 00 jne 3ed7 <...> to: 3e28: 31 c0 xor %eax,%eax 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13) 3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...> where a TEST instruction after CMPXCHG is saved. The improvements for 32-bit targets are even more noticeable, because double-word compare after CMPXCHG8B gets eliminated. 
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Waiman Long Link: https://lore.kernel.org/r/20240414161257.49145-1-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/cmpxchg_64.h | 6 ++++++ 2 files changed, 40 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 9e0d330dd5d0..9dedc13d5a77 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -64,6 +64,11 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } +static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64(ptr, oldp, new,); +} + #ifdef CONFIG_X86_CMPXCHG64 #define arch_cmpxchg64 __cmpxchg64 @@ -72,6 +77,8 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne #define arch_try_cmpxchg64 __try_cmpxchg64 +#define arch_try_cmpxchg64_local __try_cmpxchg64_local + #else /* @@ -150,6 +157,33 @@ static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 } #define arch_try_cmpxchg64 arch_try_cmpxchg64 +#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ + "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu_local(ptr, oldp, new); +} +#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local + #endif #define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index c1d6cd58f809..5e241306db26 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,12 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) +#define arch_try_cmpxchg64_local(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg_local((ptr), (po), (n)); \ +}) + union __u128_halves { u128 full; struct { -- cgit v1.2.3 From 33eb8ab4ec83cf0975d0113966c7e71cd6be60b2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 17 Apr 2024 19:58:12 +0200 Subject: locking/atomic/x86: Merge __arch{,_try}_cmpxchg64_emu_local() with __arch{,_try}_cmpxchg64_emu() Macros __arch{,_try}_cmpxchg64_emu() are almost identical to their local variants __arch{,_try}_cmpxchg64_emu_local(), differing only by lock prefixes. Merge these two macros by introducing additional macro parameters to pass lock location and lock prefix from their respective static inline functions. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20240417175830.161561-1-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 56 +++++++-------------------------------- 1 file changed, 10 insertions(+), 46 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 9dedc13d5a77..ed2797f132ce 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -86,14 +86,14 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -#define __arch_cmpxchg64_emu(_ptr, _old, _new) \ +#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \ ({ \ union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ : [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high), "S" (_ptr) \ @@ -104,40 +104,25 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu(ptr, old, new); + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); } #define arch_cmpxchg64 arch_cmpxchg64 -#define __arch_cmpxchg64_emu_local(_ptr, _old, _new) \ -({ \ - union __u64_halves o = { .full = (_old), }, \ - n = { .full = (_new), }; \ - \ - asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ - "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ - : [ptr] "+m" (*(_ptr)), \ - "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ - : "memory"); \ - \ - o.full; \ -}) - static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu_local(ptr, old, new); + return __arch_cmpxchg64_emu(ptr, old, new, ,); } #define arch_cmpxchg64_local arch_cmpxchg64_local -#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new) \ +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \ ({ \ union __u64_halves o = { .full = *(_oldp), }, \ n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ CC_SET(e) \ : CC_OUT(e) (ret), \ [ptr] "+m" (*(_ptr)), \ @@ -153,34 +138,13 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu(ptr, oldp, new); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); } #define arch_try_cmpxchg64 arch_try_cmpxchg64 -#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \ -({ \ - union __u64_halves o = { .full = *(_oldp), }, \ - n = { .full = (_new), }; \ - bool ret; \ - \ - asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ - "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : CC_OUT(e) (ret), \ - [ptr] "+m" (*(_ptr)), \ - "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ - : "memory"); \ - \ - if (unlikely(!ret)) \ - *(_oldp) = o.full; \ - \ - likely(ret); \ -}) - static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu_local(ptr, 
oldp, new); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,); } #define arch_try_cmpxchg64_local arch_try_cmpxchg64_local -- cgit v1.2.3 From 94af3a04e3f386d4f060d903826e85aa006ce252 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Apr 2024 14:00:38 +0200 Subject: locking/qspinlock/x86: Micro-optimize virt_spin_lock() Optimize virt_spin_lock() to use the simpler and faster: atomic_try_cmpxchg(*ptr, &val, new) instead of: atomic_cmpxchg(*ptr, val, new) == val The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after the CMPXCHG. Also optimize the retry loop a bit. atomic_try_cmpxchg() fails iff lock->val != 0, so there is no need to load and compare the lock value again - cpu_relax() can be called unconditionally in this case. This allows us to generate optimized: 1f: ba 01 00 00 00 mov $0x1,%edx 24: 8b 03 mov (%rbx),%eax 26: 85 c0 test %eax,%eax 28: 75 63 jne 8d <...> 2a: f0 0f b1 13 lock cmpxchg %edx,(%rbx) 2e: 75 5d jne 8d <...> ... 8d: f3 90 pause 8f: eb 93 jmp 24 <...> instead of: 1f: ba 01 00 00 00 mov $0x1,%edx 24: 8b 03 mov (%rbx),%eax 26: 85 c0 test %eax,%eax 28: 75 13 jne 3d <...> 2a: f0 0f b1 13 lock cmpxchg %edx,(%rbx) 2e: 85 c0 test %eax,%eax 30: 75 f2 jne 24 <...> ... 3d: f3 90 pause 3f: eb e3 jmp 24 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240422120054.199092-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index cde8357bb226..a053c1293975 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -85,6 +85,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { + int val; + if (!static_branch_likely(&virt_spin_lock_key)) return false; @@ -94,10 +96,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock) * horrible lock 'holder' preemption issues. */ - do { - while (atomic_read(&lock->val) != 0) - cpu_relax(); - } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } return true; } -- cgit v1.2.3 From 532453e7aa78f3962fb4d86caf40ff81ebf62160 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Apr 2024 17:17:35 +0200 Subject: locking/pvqspinlock/x86: Use _Q_LOCKED_VAL in PV_UNLOCK_ASM macro Use _Q_LOCKED_VAL instead of hardcoded $0x1 in PV_UNLOCK_ASM macro. No functional changes intended.
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Boqun Feng Cc: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240422151752.53997-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 466af57b8ed6..0a985784be9b 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -40,7 +40,7 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define PV_UNLOCK_ASM \ FRAME_BEGIN \ "push %rdx\n\t" \ - "mov $0x1,%eax\n\t" \ + "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ "jne .slowpath\n\t" \ -- cgit v1.2.3
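
Editor's note: the recurring pattern in this series is replacing "cmpxchg() and re-compare" with try_cmpxchg(), which reports success as a boolean (ZF on x86) and refreshes the expected value on failure. The sketch below is a minimal user-space analogue of that pattern, not the kernel code touched by these patches: it assumes a GCC- or Clang-compatible compiler on x86, uses the __atomic builtins and the x86-only __builtin_ia32_pause() instead of the kernel's arch_atomic64_*()/atomic_try_cmpxchg()/cpu_relax() APIs, and the helper names fetch_and64() and spin_lock32() are made up for illustration only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Unconditional try_cmpxchg() loop in the style of the rewritten x86_32
 * arch_atomic64_fetch_and(): prime 'val' with a plain (possibly torn)
 * load; a failed compare-exchange refreshes 'val' with the current
 * value, so no separate re-read or compare is needed.
 */
static uint64_t fetch_and64(uint64_t *p, uint64_t mask)
{
	uint64_t val = *p;	/* non-atomic prime, retried if torn */

	while (!__atomic_compare_exchange_n(p, &val, val & mask, false,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;

	return val;		/* old value, as a fetch_and() returns */
}

/*
 * virt_spin_lock()-style acquire loop: test the plain value first and
 * only attempt the compare-exchange when the lock looks free; either
 * failure falls through to a pause and a retry.
 */
static void spin_lock32(uint32_t *lock)
{
	uint32_t val;

retry:
	val = __atomic_load_n(lock, __ATOMIC_RELAXED);
	if (val || !__atomic_compare_exchange_n(lock, &val, 1, false,
						__ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED)) {
		__builtin_ia32_pause();	/* cpu_relax() analogue on x86 */
		goto retry;
	}
}

int main(void)
{
	uint64_t v = 0xffff0000ffff0000ull;
	uint32_t lock = 0;

	spin_lock32(&lock);
	printf("old=%#llx new=%#llx\n",
	       (unsigned long long)fetch_and64(&v, 0xff00ff00ff00ff00ull),
	       (unsigned long long)v);
	__atomic_store_n(&lock, 0, __ATOMIC_RELEASE);
	return 0;
}

As in the patches above, the gain is that the compare-exchange itself drives the branch (ZF on x86), so the compiler emits no extra XOR/OR/TEST sequence to re-compare old and expected values, and a failed attempt leaves the freshly observed value in 'val' for the next iteration.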