From d16dbd1b8a29bb9f8aca2c2f3bd1a0d2b7621126 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:22 +0800 Subject: locking/lockdep: Update obsolete struct field description The lock_chain struct definition has outdated comment, update it and add struct member description. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-7-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6e2377e6c1d6..851d44fa5457 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -203,11 +203,17 @@ struct lock_list { struct lock_list *parent; }; -/* - * We record lock dependency chains, so that we can cache them: +/** + * struct lock_chain - lock dependency chain record + * + * @irq_context: the same as irq_context in held_lock below + * @depth: the number of held locks in this chain + * @base: the index in chain_hlocks for this chain + * @entry: the collided lock chains in lock_chain hash list + * @chain_key: the hash key of this lock_chain */ struct lock_chain { - /* see BUILD_BUG_ON()s in lookup_chain_cache() */ + /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; -- cgit v1.2.3 From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:23 +0800 Subject: locking/lockdep: Use lockdep_init_task for task initiation consistently Despite that there is a lockdep_init_task() which does nothing, lockdep initiates tasks by assigning lockdep fields and does so inconsistently. Fix this by using lockdep_init_task(). Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 7 ++++++- init/init_task.c | 2 ++ kernel/fork.c | 3 --- kernel/locking/lockdep.c | 11 ++++++++--- 4 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 851d44fa5457..5d05b8149f19 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -287,6 +287,8 @@ extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); +extern void lockdep_init_task(struct task_struct *task); + extern void lockdep_off(void); extern void lockdep_on(void); @@ -411,6 +413,10 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #else /* !CONFIG_LOCKDEP */ +static inline void lockdep_init_task(struct task_struct *task) +{ +} + static inline void lockdep_off(void) { } @@ -503,7 +509,6 @@ enum xhlock_context_t { { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} -static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT diff --git a/init/init_task.c b/init/init_task.c index c70ef656d0f4..1b15cb90d64f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -166,6 +166,8 @@ struct task_struct init_task .softirqs_enabled = 1, #endif #ifdef CONFIG_LOCKDEP + .lockdep_depth = 0, /* no locks held yet */ + .curr_chain_key = 0, .lockdep_recursion = 0, #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..735d0b4a89e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1984,9 +1984,6 @@ static __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; lockdep_init_task(p); #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bc1efc12a8c5..b7d9c28ecf3b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -359,6 +359,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } +void lockdep_init_task(struct task_struct *task) +{ + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = 0; + task->lockdep_recursion = 0; +} + void lockdep_off(void) { current->lockdep_recursion++; @@ -4589,9 +4596,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; -- cgit v1.2.3 From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:24 +0800 Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start with Chain keys are computed using Jenkins hash function, which needs an initial hash to start with. Dedicate a macro to make this clear and configurable. A later patch changes this initial chain key. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 1 + init/init_task.c | 2 +- kernel/locking/lockdep.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5d05b8149f19..d4e69595dbd4 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -229,6 +229,7 @@ struct lock_chain { * bitfield and hitting the BUG in hlock_class(). */ #define MAX_LOCKDEP_KEYS ((1UL << MAX_LOCKDEP_KEYS_BITS) - 1) +#define INITIAL_CHAIN_KEY 0 struct held_lock { /* diff --git a/init/init_task.c b/init/init_task.c index 1b15cb90d64f..afa6ad795355 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -167,7 +167,7 @@ struct task_struct init_task #endif #ifdef CONFIG_LOCKDEP .lockdep_depth = 0, /* no locks held yet */ - .curr_chain_key = 0, + .curr_chain_key = INITIAL_CHAIN_KEY, .lockdep_recursion = 0, #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7d9c28ecf3b..9edf6f12b711 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -362,7 +362,7 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) void lockdep_init_task(struct task_struct *task) { task->lockdep_depth = 0; /* no locks held yet */ - task->curr_chain_key = 0; + task->curr_chain_key = INITIAL_CHAIN_KEY; task->lockdep_recursion = 0; } @@ -857,7 +857,7 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; static bool check_lock_chain_key(struct lock_chain *chain) { #ifdef CONFIG_PROVE_LOCKING - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int i; for (i = chain->base; i < chain->base + chain->depth; i++) @@ -2524,7 +2524,7 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; int i = get_first_held_lock(curr, hlock_next); @@ -2544,7 +2544,7 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int class_id; printk("depth: %u\n", chain->depth); @@ -2848,7 +2848,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2872,7 +2872,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } @@ -3787,14 +3787,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } chain_key = iterate_chain_key(chain_key, class_idx); @@ -4636,7 +4636,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, return; recalc: - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); if (chain->depth && chain->chain_key == chain_key) -- cgit v1.2.3 From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:25 +0800 Subject: locking/lockdep: Change the range of class_idx in held_lock struct held_lock->class_idx is used to point to the class of the held lock. The index is shifted by 1 to make index 0 mean no class, which results in class index shifting back and forth but is not worth doing so. The reason is: (1) there will be no "no-class" held_lock to begin with, and (2) index 0 seems to be used for error checking, but if something wrong indeed happened, the index can't be counted on to distinguish it as that something won't set the class_idx to 0 on purpose to tell us it is wrong. Therefore, change the index to start from 0. This saves a lot of back-and-forth shifts and a class slot back to lock_classes. Since index 0 is now used for lock class, we change the initial chain key to -1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0. Actually, the initial chain key can be any arbitrary value other than 0. In addition, a bitmap is maintained to keep track of the used lock classes, and we check the validity of the held lock against that bitmap. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 14 ++++++------ kernel/locking/lockdep.c | 59 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d4e69595dbd4..30a0f81aa130 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -223,13 +223,8 @@ struct lock_chain { }; #define MAX_LOCKDEP_KEYS_BITS 13 -/* - * Subtract one because we offset hlock->class_idx by 1 in order - * to make 0 mean no class. This avoids overflowing the class_idx - * bitfield and hitting the BUG in hlock_class(). - */ -#define MAX_LOCKDEP_KEYS ((1UL << MAX_LOCKDEP_KEYS_BITS) - 1) -#define INITIAL_CHAIN_KEY 0 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) +#define INITIAL_CHAIN_KEY -1 struct held_lock { /* @@ -254,6 +249,11 @@ struct held_lock { u64 waittime_stamp; u64 holdtime_stamp; #endif + /* + * class_idx is zero-indexed; it points to the element in + * lock_classes this held lock instance belongs to. class_idx is in + * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. + */ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9edf6f12b711..3eecae315885 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -151,17 +151,28 @@ unsigned long nr_lock_classes; static #endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -590,19 +601,22 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. + * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. */ - unsigned int class_idx = hlock->class_idx; + struct lock_class *lock = hlock_class(hlock); - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); - - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "\n"); return; } printk(KERN_CONT "%p", hlock->instance); - print_lock_name(lock_classes + class_idx - 1); + print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -861,7 +875,7 @@ static bool check_lock_chain_key(struct lock_chain *chain) int i; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); /* * The 'unsigned long long' casts avoid that a compiler warning * is reported when building tools/lib/lockdep. @@ -1136,6 +1150,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; @@ -2550,7 +2565,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + chain_key = print_chain_key_iteration(class_id, chain_key); print_lock_name(lock_classes + class_id); printk("\n"); @@ -2601,7 +2616,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = curr->held_locks[i].class_idx; if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2684,7 +2699,7 @@ static inline int add_chain_cache(struct task_struct *curr, if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; + int lock_id = curr->held_locks[i].class_idx; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -2864,10 +2879,12 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? */ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != @@ -3715,7 +3732,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; if (depth) { hlock = curr->held_locks + depth - 1; @@ -3777,9 +3794,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. */ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3894,7 +3911,7 @@ static int match_held_lock(const struct held_lock *hlock, if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3988,7 +4005,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4638,7 +4655,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, recalc: chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); if (chain->depth && chain->chain_key == chain_key) return; /* Overwrite the chain key for concurrent RCU readers. */ @@ -4712,6 +4729,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -5057,6 +5075,7 @@ void __init lockdep_init(void) printk(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + sizeof(classhash_table) + sizeof(list_entries) + sizeof(list_entries_in_use) + -- cgit v1.2.3 From 9255813d5841e158f033e0d83d455bffdae009a4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 22 May 2019 14:22:35 +0100 Subject: locking/atomic: Use s64 for atomic64 As a step towards making the atomic64 API use consistent types treewide, let's have the generic atomic64 implementation use s64 as the underlying type for atomic64_t, rather than long long, matching the generated headers. Otherwise, there should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Arnd Bergmann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: aou@eecs.berkeley.edu Cc: bp@alien8.de Cc: catalin.marinas@arm.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: ink@jurassic.park.msu.ru Cc: jhogan@kernel.org Cc: linux@armlinux.org.uk Cc: mattst88@gmail.com Cc: mpe@ellerman.id.au Cc: palmer@sifive.com Cc: paul.burton@mips.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: rth@twiddle.net Cc: tony.luck@intel.com Cc: vgupta@synopsys.com Link: https://lkml.kernel.org/r/20190522132250.26499-4-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/asm-generic/atomic64.h | 20 ++++++++++---------- lib/atomic64.c | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index d7a15096fb3b..370f01d4450f 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -10,24 +10,24 @@ #include typedef struct { - long long counter; + s64 counter; } atomic64_t; #define ATOMIC64_INIT(i) { (i) } -extern long long atomic64_read(const atomic64_t *v); -extern void atomic64_set(atomic64_t *v, long long i); +extern s64 atomic64_read(const atomic64_t *v); +extern void atomic64_set(atomic64_t *v, s64 i); #define atomic64_set_release(v, i) atomic64_set((v), (i)) #define ATOMIC64_OP(op) \ -extern void atomic64_##op(long long a, atomic64_t *v); +extern void atomic64_##op(s64 a, atomic64_t *v); #define ATOMIC64_OP_RETURN(op) \ -extern long long atomic64_##op##_return(long long a, atomic64_t *v); +extern s64 atomic64_##op##_return(s64 a, atomic64_t *v); #define ATOMIC64_FETCH_OP(op) \ -extern long long atomic64_fetch_##op(long long a, atomic64_t *v); +extern s64 atomic64_fetch_##op(s64 a, atomic64_t *v); #define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op) ATOMIC64_FETCH_OP(op) @@ -46,11 +46,11 @@ ATOMIC64_OPS(xor) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -extern long long atomic64_dec_if_positive(atomic64_t *v); +extern s64 atomic64_dec_if_positive(atomic64_t *v); #define atomic64_dec_if_positive atomic64_dec_if_positive -extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n); -extern long long atomic64_xchg(atomic64_t *v, long long new); -extern long long atomic64_fetch_add_unless(atomic64_t *v, long long a, long long u); +extern s64 atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n); +extern s64 atomic64_xchg(atomic64_t *v, s64 new); +extern s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u); #define atomic64_fetch_add_unless atomic64_fetch_add_unless #endif /* _ASM_GENERIC_ATOMIC64_H */ diff --git a/lib/atomic64.c b/lib/atomic64.c index 7e6905751522..e98c85a99787 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -42,11 +42,11 @@ static inline raw_spinlock_t *lock_addr(const atomic64_t *v) return &atomic64_lock[addr & (NR_LOCKS - 1)].lock; } -long long atomic64_read(const atomic64_t *v) +s64 atomic64_read(const atomic64_t *v) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); - long long val; + s64 val; raw_spin_lock_irqsave(lock, flags); val = v->counter; @@ -55,7 +55,7 @@ long long atomic64_read(const atomic64_t *v) } EXPORT_SYMBOL(atomic64_read); -void atomic64_set(atomic64_t *v, long long i) +void atomic64_set(atomic64_t *v, s64 i) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); @@ -67,7 +67,7 @@ void atomic64_set(atomic64_t *v, long long i) EXPORT_SYMBOL(atomic64_set); #define ATOMIC64_OP(op, c_op) \ -void atomic64_##op(long long a, atomic64_t *v) \ +void atomic64_##op(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ raw_spinlock_t *lock = lock_addr(v); \ @@ -79,11 +79,11 @@ void atomic64_##op(long long a, atomic64_t *v) \ EXPORT_SYMBOL(atomic64_##op); #define ATOMIC64_OP_RETURN(op, c_op) \ -long long atomic64_##op##_return(long long a, atomic64_t *v) \ +s64 atomic64_##op##_return(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ raw_spinlock_t *lock = lock_addr(v); \ - long long val; \ + s64 val; \ \ raw_spin_lock_irqsave(lock, flags); \ val = (v->counter c_op a); \ @@ -93,11 +93,11 @@ long long atomic64_##op##_return(long long a, atomic64_t *v) \ EXPORT_SYMBOL(atomic64_##op##_return); #define ATOMIC64_FETCH_OP(op, c_op) \ -long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +s64 atomic64_fetch_##op(s64 a, atomic64_t *v) \ { \ unsigned long flags; \ raw_spinlock_t *lock = lock_addr(v); \ - long long val; \ + s64 val; \ \ raw_spin_lock_irqsave(lock, flags); \ val = v->counter; \ @@ -130,11 +130,11 @@ ATOMIC64_OPS(xor, ^=) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -long long atomic64_dec_if_positive(atomic64_t *v) +s64 atomic64_dec_if_positive(atomic64_t *v) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); - long long val; + s64 val; raw_spin_lock_irqsave(lock, flags); val = v->counter - 1; @@ -145,11 +145,11 @@ long long atomic64_dec_if_positive(atomic64_t *v) } EXPORT_SYMBOL(atomic64_dec_if_positive); -long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n) +s64 atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); - long long val; + s64 val; raw_spin_lock_irqsave(lock, flags); val = v->counter; @@ -160,11 +160,11 @@ long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n) } EXPORT_SYMBOL(atomic64_cmpxchg); -long long atomic64_xchg(atomic64_t *v, long long new) +s64 atomic64_xchg(atomic64_t *v, s64 new) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); - long long val; + s64 val; raw_spin_lock_irqsave(lock, flags); val = v->counter; @@ -174,11 +174,11 @@ long long atomic64_xchg(atomic64_t *v, long long new) } EXPORT_SYMBOL(atomic64_xchg); -long long atomic64_fetch_add_unless(atomic64_t *v, long long a, long long u) +s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned long flags; raw_spinlock_t *lock = lock_addr(v); - long long val; + s64 val; raw_spin_lock_irqsave(lock, flags); val = v->counter; -- cgit v1.2.3 From 3724921396dd1a07c93e3516b8d7c9ff570d17a9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 22 May 2019 14:22:48 +0100 Subject: locking/atomic: Use s64 for atomic64_t on 64-bit Now that all architectures use 64 consistently as the base type for the atomic64 API, let's have the CONFIG_64BIT definition of atomic64_t use s64 as the underlying type for atomic64_t, rather than long, matching the generated headers. On architectures where atomic64_read(v) is READ_ONCE(v->counter), this patch will cause the return type of atomic64_read() to be s64. As of this patch, the atomic64 API can be relied upon to consistently return s64 where a value rather than boolean condition is returned. This should make code more robust, and simpler, allowing for the removal of casts previously required to ensure consistent types. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: aou@eecs.berkeley.edu Cc: arnd@arndb.de Cc: bp@alien8.de Cc: catalin.marinas@arm.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: ink@jurassic.park.msu.ru Cc: jhogan@kernel.org Cc: linux@armlinux.org.uk Cc: mattst88@gmail.com Cc: mpe@ellerman.id.au Cc: palmer@sifive.com Cc: paul.burton@mips.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: rth@twiddle.net Cc: tony.luck@intel.com Cc: vgupta@synopsys.com Link: https://lkml.kernel.org/r/20190522132250.26499-17-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index 231114ae38f4..05030f608be3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -174,7 +174,7 @@ typedef struct { #ifdef CONFIG_64BIT typedef struct { - long counter; + s64 counter; } atomic64_t; #endif -- cgit v1.2.3 From c2ba8a15f310d915f8748dd8324c91c82b12b5ff Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:30 +0200 Subject: jump_label: Batch updates if arch supports it If the architecture supports the batching of jump label updates, use it! An easy way to see the benefits of this patch is switching the schedstats on and off. For instance: -------------------------- %< ---------------------------- #!/bin/sh while [ true ]; do sysctl -w kernel.sched_schedstats=1 sleep 2 sysctl -w kernel.sched_schedstats=0 sleep 2 done -------------------------- >% ---------------------------- while watching the IPI count: -------------------------- %< ---------------------------- # watch -n1 "cat /proc/interrupts | grep Function" -------------------------- >% ---------------------------- With the current mode, it is possible to see +- 168 IPIs each 2 seconds, while with this patch the number of IPIs goes to 3 each 2 seconds. Regarding the performance impact of this patch set, I made two measurements: The time to update a key (the task that is causing the change) The time to run the int3 handler (the side effect on a thread that hits the code being changed) The schedstats static key was chosen as the key to being switched on and off. The reason being is that it is used in more than 56 places, in a hot path. The change in the schedstats static key will be done with the following command: while [ true ]; do sysctl -w kernel.sched_schedstats=1 usleep 500000 sysctl -w kernel.sched_schedstats=0 usleep 500000 done In this way, they key will be updated twice per second. To force the hit of the int3 handler, the system will also run a kernel compilation with two jobs per CPU. The test machine is a two nodes/24 CPUs box with an Intel Xeon processor @2.27GHz. Regarding the update part, on average, the regular kernel takes 57 ms to update the schedstats key, while the kernel with the batch updates takes just 1.4 ms on average. Although it seems to be too good to be true, it makes sense: the schedstats key is used in 56 places, so it was expected that it would take around 56 times to update the keys with the current implementation, as the IPIs are the most expensive part of the update. Regarding the int3 handler, the non-batch handler takes 45 ns on average, while the batch version takes around 180 ns. At first glance, it seems to be a high value. But it is not, considering that it is doing 56 updates, rather than one! It is taking four times more, only. This gain is possible because the patch uses a binary search in the vector: log2(56)=5.8. So, it was expected to have an overhead within four times. (voice of tv propaganda) But, that is not all! As the int3 handler keeps on for a shorter period (because the update part is on for a shorter time), the number of hits in the int3 handler decreased by 10%. The question then is: Is it worth paying the price of "135 ns" more in the int3 handler? Considering that, in this test case, we are saving the handling of 53 IPIs, that takes more than these 135 ns, it seems to be a meager price to be paid. Moreover, the test case was forcing the hit of the int3, in practice, it does not take that often. While the IPI takes place on all CPUs, hitting the int3 handler or not! For instance, in an isolated CPU with a process running in user-space (nohz_full use-case), the chances of hitting the int3 handler is barely zero, while there is no way to avoid the IPIs. By bounding the IPIs, we are improving a lot this scenario. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/acc891dbc2dbc9fd616dd680529a2337b1d1274c.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 3 +++ kernel/jump_label.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 3e113a1fa0f1..3526c0aee954 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -215,6 +215,9 @@ extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); +extern bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type); +extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ca00ac10d9b9..df3008419a1d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -414,6 +414,7 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) return true; } +#ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, @@ -424,6 +425,28 @@ static void __jump_label_update(struct static_key *key, arch_jump_label_transform(entry, jump_label_type(entry)); } } +#else +static void __jump_label_update(struct static_key *key, + struct jump_entry *entry, + struct jump_entry *stop, + bool init) +{ + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { + + if (!jump_label_can_update(entry, init)) + continue; + + if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { + /* + * Queue is full: Apply the current queue and try again. + */ + arch_jump_label_transform_apply(); + BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); + } + } + arch_jump_label_transform_apply(); +} +#endif void __init jump_label_init(void) { -- cgit v1.2.3 From 9ffbe8ac05dbb4ab4a4836a55a47fc6be945a38f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 31 May 2019 13:06:51 +0300 Subject: locking/lockdep: Rename lockdep_assert_held_exclusive() -> lockdep_assert_held_write() All callers of lockdep_assert_held_exclusive() use it to verify the correct locking state of either a semaphore (ldisc_sem in tty, mmap_sem for perf events, i_rwsem of inode for dax) or rwlock by apparmor. Thus it makes sense to rename _exclusive to _write since that's the semantics callers care. Additionally there is already lockdep_assert_held_read(), which this new naming is more consistent with. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190531100651.3969-1-nborisov@suse.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- drivers/infiniband/core/device.c | 2 +- drivers/tty/tty_ldisc.c | 8 ++++---- fs/dax.c | 2 +- include/linux/lockdep.h | 4 ++-- security/apparmor/label.c | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f315425d8468..cf91d80b8452 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2179,7 +2179,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(&mm->mmap_sem); + lockdep_assert_held_write(&mm->mmap_sem); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 29f7b15c81d9..d020bb4d03d5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -457,7 +457,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name) int rc; int i; - lockdep_assert_held_exclusive(&devices_rwsem); + lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index e38f104db174..fde8d4073e74 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -487,7 +487,7 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld) static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) { - lockdep_assert_held_exclusive(&tty->ldisc_sem); + lockdep_assert_held_write(&tty->ldisc_sem); WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags)); clear_bit(TTY_LDISC_OPEN, &tty->flags); if (ld->ops->close) @@ -509,7 +509,7 @@ static int tty_ldisc_failto(struct tty_struct *tty, int ld) struct tty_ldisc *disc = tty_ldisc_get(tty, ld); int r; - lockdep_assert_held_exclusive(&tty->ldisc_sem); + lockdep_assert_held_write(&tty->ldisc_sem); if (IS_ERR(disc)) return PTR_ERR(disc); tty->ldisc = disc; @@ -633,7 +633,7 @@ EXPORT_SYMBOL_GPL(tty_set_ldisc); */ static void tty_ldisc_kill(struct tty_struct *tty) { - lockdep_assert_held_exclusive(&tty->ldisc_sem); + lockdep_assert_held_write(&tty->ldisc_sem); if (!tty->ldisc) return; /* @@ -681,7 +681,7 @@ int tty_ldisc_reinit(struct tty_struct *tty, int disc) struct tty_ldisc *ld; int retval; - lockdep_assert_held_exclusive(&tty->ldisc_sem); + lockdep_assert_held_write(&tty->ldisc_sem); ld = tty_ldisc_get(tty, disc); if (IS_ERR(ld)) { BUG_ON(disc == N_TTY); diff --git a/fs/dax.c b/fs/dax.c index 2e48c7ebb973..bf8686d48b2d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1188,7 +1188,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, unsigned flags = 0; if (iov_iter_rw(iter) == WRITE) { - lockdep_assert_held_exclusive(&inode->i_rwsem); + lockdep_assert_held_write(&inode->i_rwsem); flags |= IOMAP_WRITE; } else { lockdep_assert_held(&inode->i_rwsem); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 30a0f81aa130..151d55711082 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -394,7 +394,7 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) -#define lockdep_assert_held_exclusive(l) do { \ +#define lockdep_assert_held_write(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ } while (0) @@ -479,7 +479,7 @@ struct lockdep_map { }; #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) -#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 068e93c5d29c..59f1cc2557a7 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -76,7 +76,7 @@ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) AA_BUG(!orig); AA_BUG(!new); - lockdep_assert_held_exclusive(&labels_set(orig)->lock); + lockdep_assert_held_write(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); @@ -566,7 +566,7 @@ static bool __label_remove(struct aa_label *label, struct aa_label *new) AA_BUG(!ls); AA_BUG(!label); - lockdep_assert_held_exclusive(&ls->lock); + lockdep_assert_held_write(&ls->lock); if (new) __aa_proxy_redirect(label, new); @@ -603,7 +603,7 @@ static bool __label_replace(struct aa_label *old, struct aa_label *new) AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); - lockdep_assert_held_exclusive(&ls->lock); + lockdep_assert_held_write(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) @@ -640,7 +640,7 @@ static struct aa_label *__label_insert(struct aa_labelset *ls, AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); - lockdep_assert_held_exclusive(&ls->lock); + lockdep_assert_held_write(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ -- cgit v1.2.3 From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:00 -0400 Subject: locking/rwsem: Make owner available even if !CONFIG_RWSEM_SPIN_ON_OWNER The owner field in the rw_semaphore structure is used primarily for optimistic spinning. However, identifying the rwsem owner can also be helpful in debugging as well as tracing locking related issues when analyzing crash dump. The owner field may also store state information that can be important to the operation of the rwsem. So the owner field is now made a permanent member of the rw_semaphore structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 9 +++++---- kernel/locking/rwsem-xadd.c | 2 +- kernel/locking/rwsem.h | 23 ----------------------- lib/Kconfig.debug | 8 ++++---- 4 files changed, 10 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 2ea18a3def04..148983e21d47 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -34,12 +34,12 @@ */ struct rw_semaphore { atomic_long_t count; -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * Write owner. Used as a speculative check to see - * if the owner is running on the cpu. + * Write owner or one of the read owners. Can be used as a + * speculative check to see if the owner is running on the cpu. */ struct task_struct *owner; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; @@ -73,13 +73,14 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER -#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL +#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_INIT_COUNT(name), \ + .owner = NULL, \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0b1f77957240..c0500679fd2f 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -86,8 +86,8 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER sem->owner = NULL; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 64877f5294e3..eb9c8534299b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -61,7 +61,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that * store tearing can't happen as optimistic spinners may read and use @@ -126,7 +125,6 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) * real owner or one of the real owners. The only exception is when the * unlock is done by up_read_non_owner(). */ -#define rwsem_clear_reader_owned rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED @@ -135,28 +133,7 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) cmpxchg_relaxed((unsigned long *)&sem->owner, val, RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); } -#endif - #else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -#ifndef rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cbdfae379896..417bdd9e80fb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1095,7 +1095,7 @@ config PROVE_LOCKING select DEBUG_SPINLOCK select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES - select DEBUG_RWSEMS if RWSEM_SPIN_ON_OWNER + select DEBUG_RWSEMS select DEBUG_WW_MUTEX_SLOWPATH select DEBUG_LOCK_ALLOC select TRACE_IRQFLAGS @@ -1199,10 +1199,10 @@ config DEBUG_WW_MUTEX_SLOWPATH config DEBUG_RWSEMS bool "RW Semaphore debugging: basic checks" - depends on DEBUG_KERNEL && RWSEM_SPIN_ON_OWNER + depends on DEBUG_KERNEL help - This debugging feature allows mismatched rw semaphore locks and unlocks - to be detected and reported. + This debugging feature allows mismatched rw semaphore locks + and unlocks to be detected and reported. config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" -- cgit v1.2.3 From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:07 -0400 Subject: locking/rwsem: Always release wait_lock before waking up tasks With the use of wake_q, we can do task wakeups without holding the wait_lock. There is one exception in the rwsem code, though. It is when the writer in the slowpath detects that there are waiters ahead but the rwsem is not held by a writer. This can lead to a long wait_lock hold time especially when a large number of readers are to be woken up. Remediate this situation by releasing the wait_lock before waking up tasks and re-acquiring it afterward. The rwsem_try_write_lock() function is also modified to read the rwsem count directly to avoid stale count value. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 5 +++++ kernel/locking/rwsem.c | 31 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index ad826d2a4557..26a2013ac39c 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -51,6 +51,11 @@ static inline void wake_q_init(struct wake_q_head *head) head->lastp = &head->first; } +static inline bool wake_q_empty(struct wake_q_head *head) +{ + return head->first == WAKE_Q_TAIL; +} + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index decda9fb8c6d..5532304406f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -400,13 +400,14 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * If wstate is WRITER_HANDOFF, it will make sure that either the handoff * bit is set or the lock is acquired with handoff bit cleared. */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem, +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, enum writer_wait_state wstate) { - long new; + long count, new; lockdep_assert_held(&sem->wait_lock); + count = atomic_long_read(&sem->count); do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); @@ -751,26 +752,25 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) ? RWSEM_WAKE_READERS : RWSEM_WAKE_ANY, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* We need wake_q again below, reinitialize */ - wake_q_init(&wake_q); + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up. + */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + wake_q_init(&wake_q); /* Used again, reinit */ + raw_spin_lock_irq(&sem->wait_lock); + } } else { - count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } wait: /* wait until we successfully acquire the lock */ set_current_state(state); while (true) { - if (rwsem_try_write_lock(count, sem, wstate)) + if (rwsem_try_write_lock(sem, wstate)) break; raw_spin_unlock_irq(&sem->wait_lock); @@ -811,7 +811,6 @@ wait: } raw_spin_lock_irq(&sem->wait_lock); - count = atomic_long_read(&sem->count); } __set_current_state(TASK_RUNNING); list_del(&waiter.list); -- cgit v1.2.3 From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:10 -0400 Subject: locking/rwsem: Clarify usage of owner's nonspinaable bit Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an anonymous owner - readers or an anonymous writer. The setting of this anonymous bit is used as an indicator that optimistic spinning cannot be done on this rwsem. With the upcoming reader optimistic spinning patches, a reader-owned rwsem can be spinned on for a limit period of time. We still need this bit to indicate a rwsem is nonspinnable, but not setting this bit loses its meaning that the owner is known. So rename the bit to RWSEM_NONSPINNABLE to clarify its meaning. This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 2 +- kernel/locking/rwsem.c | 43 +++++++++++++++++++++---------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 148983e21d47..bb76e82398b2 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -50,7 +50,7 @@ struct rw_semaphore { }; /* - * Setting bit 1 of the owner field but not bit 0 will indicate + * Setting all bits of the owner field except bit 0 will indicate * that the rwsem is writer-owned with an unknown owner. */ #define RWSEM_OWNER_UNKNOWN ((struct task_struct *)-2L) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ded96023f4dc..180455b6b0d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -33,17 +33,18 @@ /* * The least significant 2 bits of the owner value has the following * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. + * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers + * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem + * The rwsem is anonymously owned, i.e. the owner(s) cannot be + * readily determined. It can be reader owned or the owning writer + * is indeterminate. * * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. * * When a reader acquires a rwsem, it will also puts its task_struct * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will * largely be left untouched. So for a free or reader-owned rwsem, * the owner value may contain information about the last reader that * acquires the rwsem. The anonymous bit is set because that particular @@ -55,7 +56,8 @@ * a rwsem, but the overhead is simply too big. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ @@ -132,7 +134,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; WRITE_ONCE(sem->owner, (struct task_struct *)val); } @@ -144,20 +146,12 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) /* * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. + * and steal the lock. * N.B. !owner is considered spinnable. */ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; + return !((unsigned long)owner & RWSEM_NONSPINNABLE); } #ifdef CONFIG_DEBUG_RWSEMS @@ -170,10 +164,10 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; if (READ_ONCE(sem->owner) == (struct task_struct *)val) cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -495,7 +489,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) struct task_struct *owner; bool ret = true; - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); if (need_resched()) return false; @@ -534,7 +528,7 @@ static inline enum owner_state rwsem_owner_state(unsigned long owner) if (!owner) return OWNER_NULL; - if (owner & RWSEM_ANONYMOUSLY_OWNED) + if (owner & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (owner & RWSEM_READER_OWNED) @@ -1043,7 +1037,12 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. + */ + DEBUG_RWSEMS_WARN_ON((sem->owner != current) && + !((long)sem->owner & RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) -- cgit v1.2.3 From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:12 -0400 Subject: locking/rwsem: Make rwsem->owner an atomic_long_t The rwsem->owner contains not just the task structure pointer, it also holds some flags for storing the current state of the rwsem. Some of the flags may have to be atomically updated. To reflect the new reality, the owner is now changed to an atomic_long_t type. New helper functions are added to properly separate out the task structure pointer and the embedded flags. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 4 +- include/linux/rwsem.h | 11 ++-- kernel/locking/rwsem.c | 125 +++++++++++++++++++++++++++---------------- 3 files changed, 88 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 03cb4b6f842e..0a43830f1932 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -117,7 +117,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, lock_release(&sem->rw_sem.dep_map, 1, ip); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER if (!read) - sem->rw_sem.owner = RWSEM_OWNER_UNKNOWN; + atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN); #endif } @@ -127,7 +127,7 @@ static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER if (!read) - sem->rw_sem.owner = current; + atomic_long_set(&sem->rw_sem.owner, (long)current); #endif } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index bb76e82398b2..e401358c4e7e 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -35,10 +35,11 @@ struct rw_semaphore { atomic_long_t count; /* - * Write owner or one of the read owners. Can be used as a - * speculative check to see if the owner is running on the cpu. + * Write owner or one of the read owners as well flags regarding + * the current state of the rwsem. Can be used as a speculative + * check to see if the write owner is running on the cpu. */ - struct task_struct *owner; + atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif @@ -53,7 +54,7 @@ struct rw_semaphore { * Setting all bits of the owner field except bit 0 will indicate * that the rwsem is writer-owned with an unknown owner. */ -#define RWSEM_OWNER_UNKNOWN ((struct task_struct *)-2L) +#define RWSEM_OWNER_UNKNOWN (-2L) /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) @@ -80,7 +81,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #define __RWSEM_INITIALIZER(name) \ { __RWSEM_INIT_COUNT(name), \ - .owner = NULL, \ + .owner = ATOMIC_LONG_INIT(0), \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 985a03ad3f8c..fae557be8334 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -64,7 +64,7 @@ if (!debug_locks_silent && \ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ + atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ } while (0) @@ -114,12 +114,20 @@ */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, current); + atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, NULL); + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; } /* @@ -133,10 +141,9 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE; - WRITE_ONCE(sem->owner, (struct task_struct *)val); + atomic_long_set(&sem->owner, val); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -145,13 +152,20 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) } /* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock. - * N.B. !owner is considered spinnable. + * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { - return !((unsigned long)owner & RWSEM_NONSPINNABLE); +#ifdef CONFIG_DEBUG_RWSEMS + /* + * Check the count to see if it is write-locked. + */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; +#endif + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } #ifdef CONFIG_DEBUG_RWSEMS @@ -163,11 +177,13 @@ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -175,6 +191,28 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Guide to the rw_semaphore's count field. * @@ -208,7 +246,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); - sem->owner = NULL; + atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -511,9 +549,10 @@ static inline bool owner_on_cpu(struct task_struct *owner) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; + unsigned long flags; bool ret = true; - BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); if (need_resched()) { lockevent_inc(rwsem_opt_fail); @@ -522,11 +561,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) preempt_disable(); rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } + owner = rwsem_owner_flags(sem, &flags); + if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner))) + ret = false; rcu_read_unlock(); preempt_enable(); @@ -553,25 +590,26 @@ enum owner_state { }; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) -static inline enum owner_state rwsem_owner_state(unsigned long owner) +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (!owner) - return OWNER_NULL; - - if (owner & RWSEM_NONSPINNABLE) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; - if (owner & RWSEM_READER_OWNED) + if (flags & RWSEM_READER_OWNED) return OWNER_READER; - return OWNER_WRITER; + return owner ? OWNER_WRITER : OWNER_NULL; } static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) { - struct task_struct *tmp, *owner = READ_ONCE(sem->owner); - enum owner_state state = rwsem_owner_state((unsigned long)owner); + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; @@ -582,9 +620,9 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - tmp = READ_ONCE(sem->owner); - if (tmp != owner) { - state = rwsem_owner_state((unsigned long)tmp); + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags); break; } @@ -1001,8 +1039,7 @@ inline void __down_read(struct rw_semaphore *sem) if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1014,8 +1051,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) &sem->count) & RWSEM_READ_FAILED_MASK)) { if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1084,7 +1120,7 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == @@ -1103,8 +1139,8 @@ static inline void __up_write(struct rw_semaphore *sem) * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ - DEBUG_RWSEMS_WARN_ON((sem->owner != current) && - !((long)sem->owner & RWSEM_NONSPINNABLE), sem); + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) @@ -1125,7 +1161,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); @@ -1296,8 +1332,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } EXPORT_SYMBOL(up_read_non_owner); -- cgit v1.2.3 From 9ed7d75b2f09d836e71d597cd5879abb1a44e7a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 09:48:51 +0100 Subject: x86/percpu: Relax smp_processor_id() Nadav reported that since this_cpu_read() became asm-volatile, many smp_processor_id() users generated worse code due to the extra constraints. However since smp_processor_id() is reading a stable value, we can use __this_cpu_read(). While this does reduce text size somewhat, this mostly results in code movement to .text.unlikely as a result of more/larger .cold. subfunctions. Less text on the hotpath is good for I$. $ ./compare.sh defconfig-build1 defconfig-build2 vmlinux.o setup_APIC_ibs 90 98 -12,+20 force_ibs_eilvt_setup 400 413 -57,+70 pci_serr_error 109 104 -54,+49 pci_serr_error 109 104 -54,+49 unknown_nmi_error 125 120 -76,+71 unknown_nmi_error 125 120 -76,+71 io_check_error 125 132 -97,+104 intel_thermal_interrupt 730 822 +92,+0 intel_init_thermal 951 945 -6,+0 generic_get_mtrr 301 294 -7,+0 generic_get_mtrr 301 294 -7,+0 generic_set_all 749 754 -44,+49 get_fixed_ranges 352 360 -41,+49 x86_acpi_suspend_lowlevel 369 363 -6,+0 check_tsc_sync_source 412 412 -71,+71 irq_migrate_all_off_this_cpu 662 674 -14,+26 clocksource_watchdog 748 748 -113,+113 __perf_event_account_interrupt 204 197 -7,+0 attempt_merge 1748 1741 -7,+0 intel_guc_send_ct 1424 1409 -15,+0 __fini_doorbell 235 231 -4,+0 bdw_set_cdclk 928 923 -5,+0 gen11_dsi_disable 1571 1556 -15,+0 gmbus_wait 493 488 -5,+0 md_make_request 376 369 -7,+0 __split_and_process_bio 543 536 -7,+0 delay_tsc 96 89 -7,+0 hsw_disable_pc8 696 691 -5,+0 tsc_verify_tsc_adjust 215 228 -22,+35 cpuidle_driver_unref 56 49 -7,+0 blk_account_io_completion 159 148 -11,+0 mtrr_wrmsr 95 99 -29,+33 __intel_wait_for_register_fw 401 419 +18,+0 cpuidle_driver_ref 43 36 -7,+0 cpuidle_get_driver 15 8 -7,+0 blk_account_io_done 535 528 -7,+0 irq_migrate_all_off_this_cpu 662 674 -14,+26 check_tsc_sync_source 412 412 -71,+71 irq_wait_for_poll 170 163 -7,+0 generic_end_io_acct 329 322 -7,+0 x86_acpi_suspend_lowlevel 369 363 -6,+0 nohz_balance_enter_idle 198 191 -7,+0 generic_start_io_acct 254 247 -7,+0 blk_account_io_start 341 334 -7,+0 perf_event_task_tick 682 675 -7,+0 intel_init_thermal 951 945 -6,+0 amd_e400_c1e_apic_setup 47 51 -28,+32 setup_APIC_eilvt 350 328 -22,+0 hsw_enable_pc8 1611 1605 -6,+0 total 12985947 12985892 -994,+939 Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 3 ++- include/linux/smp.h | 45 +++++++++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index da545df207b2..0d3fe060a44f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -162,7 +162,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/include/linux/smp.h b/include/linux/smp.h index a56f08ff3097..aa9e5e82d8c3 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -181,29 +181,46 @@ static inline int get_boot_cpu_id(void) #endif /* !SMP */ -/* - * smp_processor_id(): get the current CPU ID. +/** + * raw_processor_id() - get the current (unstable) CPU id + * + * For then you know what you are doing and need an unstable + * CPU id. + */ + +/** + * smp_processor_id() - get the current (stable) CPU id + * + * This is the normal accessor to the CPU id and should be used + * whenever possible. + * + * The CPU id is stable when: * - * if DEBUG_PREEMPT is enabled then we check whether it is - * used in a preemption-safe way. (smp_processor_id() is safe - * if it's used in a preemption-off critical section, or in - * a thread that is bound to the current CPU.) + * - IRQs are disabled; + * - preemption is disabled; + * - the task is CPU affine. * - * NOTE: raw_smp_processor_id() is for internal use only - * (smp_processor_id() is the preferred variant), but in rare - * instances it might also be used to turn off false positives - * (i.e. smp_processor_id() use that the debugging code reports but - * which use for some reason is legal). Don't use this to hack around - * the warning message, as your code might not work under PREEMPT. + * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN + * when smp_processor_id() is used when the CPU id is not stable. */ + +/* + * Allow the architecture to differentiate between a stable and unstable read. + * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a + * regular asm read for the stable. + */ +#ifndef __smp_processor_id +#define __smp_processor_id(x) raw_smp_processor_id(x) +#endif + #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else -# define smp_processor_id() raw_smp_processor_id() +# define smp_processor_id() __smp_processor_id() #endif -#define get_cpu() ({ preempt_disable(); smp_processor_id(); }) +#define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* -- cgit v1.2.3