From 42fd8baab31f53bed2952485fcf0e92f244c5e55 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 17 Sep 2019 10:34:54 -0400 Subject: sched/core: Convert vcpu_is_preempted() from macro to an inline function Clang reports this warning: kernel/locking/osq_lock.c:25:19: warning: unused function 'node_cpu' [-Wunused-function] due to osq_lock() calling vcpu_is_preempted(node_cpu(node->prev))), but vcpu_is_preempted() is compiled away. Fix it by converting the dummy vcpu_is_preempted() from a macro to a proper static inline function. Signed-off-by: Qian Cai Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: rostedt@goodmis.org Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1568730894-10483-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f0edee94834a..e2e91960d79f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,7 +1856,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) * running or not. */ #ifndef vcpu_is_preempted -# define vcpu_is_preempted(cpu) false +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); -- cgit v1.2.3 From 3fbd7ee285b2bbc6eebd15a3c8786d9776a402a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:34 -0500 Subject: tasks: Add a count of task RCU users Add a count of the number of RCU users (currently 1) of the task struct so that we can later add the scheduler case and get rid of the very subtle task_rcu_dereference(), and just use rcu_dereference(). As suggested by Oleg have the count overlap rcu_head so that no additional space in task_struct is required. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87woebdplt.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- include/linux/sched/task.h | 1 + kernel/exit.c | 7 ++++++- kernel/fork.c | 7 +++---- 4 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e2e91960d79f..8e43e54a02c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1147,7 +1147,10 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; - struct rcu_head rcu; + union { + refcount_t rcu_users; + struct rcu_head rcu; + }; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d90ed8f75f0..153a683646ac 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -120,6 +120,7 @@ static inline void put_task_struct(struct task_struct *t) } struct task_struct *task_rcu_dereference(struct task_struct **ptask); +void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/kernel/exit.c b/kernel/exit.c index 22ab6a4bdc51..3bcaec2ea3ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,7 +227,7 @@ repeat: write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) diff --git a/kernel/fork.c b/kernel/fork.c index 1d1cd06edbc1..7eefe338d7a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* - * One for us, one for whoever does the "release_task()" (usually - * parent) - */ + /* One for the user space visible state that goes away when reaped. */ + refcount_set(&tsk->rcu_users, 1); + /* One for the rcu users, and one for the scheduler */ refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; -- cgit v1.2.3 From 154abafc68bfb7c2ef2ad5308a3b2de8968c3f61 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:34:30 -0500 Subject: tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code Remove work arounds that were written before there was a grace period after tasks left the runqueue in finish_task_switch(). In particular now that there tasks exiting the runqueue exprience a RCU grace period none of the work performed by task_rcu_dereference() excpet the rcu_dereference() is necessary so replace task_rcu_dereference() with rcu_dereference(). Remove the code in rcuwait_wait_event() that checks to ensure the current task has not exited. It is no longer necessary as it is guaranteed that any running task will experience a RCU grace period after it leaves the run queueue. Remove the comment in rcuwait_wake_up() as it is no longer relevant. Ref: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Ref: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87lfurdpk9.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 20 +++----------- include/linux/sched/task.h | 1 - kernel/exit.c | 67 ---------------------------------------------- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 4 +-- 5 files changed, 7 insertions(+), 87 deletions(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 563290fc194f..75c97e4bbc57 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -6,16 +6,11 @@ /* * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner; where it is forbidden to use - * after exit_notify(). task_struct is not properly rcu protected, - * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * task in an rcu-safe manner. * - * Alternatively we have task_rcu_dereference(), but the return - * semantics have different implications which would break the - * wakeup side. The only time @task is non-nil is when a user is - * blocked (or checking if it needs to) on a condition, and reset - * as soon as we know that the condition has succeeded and are - * awoken. + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. */ struct rcuwait { struct task_struct __rcu *task; @@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w); */ #define rcuwait_wait_event(w, condition) \ ({ \ - /* \ - * Complain if we are called after do_exit()/exit_notify(), \ - * as we cannot rely on the rcu critical region for the \ - * wakeup side. \ - */ \ - WARN_ON(current->exit_state); \ - \ rcu_assign_pointer((w)->task, current); \ for (;;) { \ /* \ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 153a683646ac..4b1c3b664f51 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -119,7 +119,6 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -struct task_struct *task_rcu_dereference(struct task_struct **ptask); void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT diff --git a/kernel/exit.c b/kernel/exit.c index 3bcaec2ea3ba..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -234,69 +234,6 @@ repeat: goto repeat; } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock(). We check task->sighand != NULL, - * but we can read the already freed and reused memory. - */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. - */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; - - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. - */ - if (!sighand) - return NULL; - - return task; -} - void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; @@ -316,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) */ smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. - */ task = rcu_dereference(w->task); if (task) wake_up_process(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3101c662426d..5bc23996ffae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1602,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..b14250a11608 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -71,7 +71,7 @@ static int membarrier_global_expedited(void) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { if (!fallback) @@ -150,7 +150,7 @@ static int membarrier_private_expedited(int flags) if (cpu == raw_smp_processor_id()) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == current->mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); -- cgit v1.2.3 From 2840cf02fae627860156737e83326df354ee4ec6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:01 -0400 Subject: sched/membarrier: Call sync_core only before usermode for same mm When the prev and next task's mm change, switch_mm() provides the core serializing guarantees before returning to usermode. The only case where an explicit core serialization is needed is when the scheduler keeps the same mm for prev and next. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- include/linux/sched/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 4a7944078cc3..8557ec664213 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -362,6 +362,8 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + if (current->mm != mm) + return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; -- cgit v1.2.3 From 227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:02 -0400 Subject: sched/membarrier: Fix p->mm->membarrier_state racy load The membarrier_state field is located within the mm_struct, which is not guaranteed to exist when used from runqueue-lock-free iteration on runqueues by the membarrier system call. Copy the membarrier_state from the mm_struct into the scheduler runqueue when the scheduler switches between mm. When registering membarrier for mm, after setting the registration bit in the mm membarrier state, issue a synchronize_rcu() to ensure the scheduler observes the change. In order to take care of the case where a runqueue keeps executing the target mm without swapping to other mm, iterate over each runqueue and issue an IPI to copy the membarrier_state from the mm_struct into each runqueue which have the same mm which state has just been modified. Move the mm membarrier_state field closer to pgd in mm_struct to use a cache line already touched by the scheduler switch_mm. The membarrier_execve() (now membarrier_exec_mmap) hook now needs to clear the runqueue's membarrier state in addition to clear the mm membarrier state, so move its implementation into the scheduler membarrier code so it can access the runqueue structure. Add memory barrier in membarrier_exec_mmap() prior to clearing the membarrier state, ensuring memory accesses executed prior to exec are not reordered with the stores clearing the membarrier state. As suggested by Linus, move all membarrier.c RCU read-side locks outside of the for each cpu loops. Suggested-by: Linus Torvalds Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- fs/exec.c | 2 +- include/linux/mm_types.h | 14 +++- include/linux/sched/mm.h | 8 +-- kernel/sched/core.c | 4 +- kernel/sched/membarrier.c | 175 ++++++++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 34 +++++++++ 6 files changed, 183 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index f7f6a140856a..555e93c7dec8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); active_mm = tsk->active_mm; + membarrier_exec_mmap(mm); tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); @@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a7a1083b6fb..ec9bd3a6c827 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -383,6 +383,16 @@ struct mm_struct { unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEMBARRIER + /** + * @membarrier_state: Flags controlling membarrier behavior. + * + * This field is close to @pgd to hopefully fit in the same + * cache-line, which needs to be touched by switch_mm(). + */ + atomic_t membarrier_state; +#endif + /** * @mm_users: The number of users including userspace. * @@ -452,9 +462,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ struct core_state *core_state; /* coredumping support */ -#ifdef CONFIG_MEMBARRIER - atomic_t membarrier_state; -#endif + #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8557ec664213..e6770012db18 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -370,10 +370,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) sync_core_before_usermode(); } -static inline void membarrier_execve(struct task_struct *t) -{ - atomic_set(&t->mm->membarrier_state, 0); -} +extern void membarrier_exec_mmap(struct mm_struct *mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -382,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, { } #endif -static inline void membarrier_execve(struct task_struct *t) +static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84c71160beb1..2d9a3947bef4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7ccbd0e19626..070cf433bb9a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,6 +30,39 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; @@ -56,6 +89,7 @@ static int membarrier_global_expedited(void) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,17 +104,25 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. + */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); + if (p->flags & PF_KTHREAD) + continue; + + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags) else smp_call_function_single(cpu, ipi_mb, NULL, 1); } - rcu_read_unlock(); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(&rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. - */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if ((atomic_read(&mm->membarrier_state) & state) == state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (atomic_read(&mm->mm_users) != 1) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif -- cgit v1.2.3