From 6e2df0581f569038719cf2bc2b3baa3fcc83cab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 11:11:52 +0100 Subject: sched: Fix pick_next_task() vs 'change' pattern race Commit 67692435c411 ("sched: Rework pick_next_task() slow-path") inadvertly introduced a race because it changed a previously unexplored dependency between dropping the rq->lock and sched_class::put_prev_task(). The comments about dropping rq->lock, in for example newidle_balance(), only mentions the task being current and ->on_cpu being set. But when we look at the 'change' pattern (in for example sched_setnuma()): queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */ running = task_current(rq, p); /* rq->curr == p */ if (queued) dequeue_task(...); if (running) put_prev_task(...); /* change task properties */ if (queued) enqueue_task(...); if (running) set_next_task(...); It becomes obvious that if we do this after put_prev_task() has already been called on @p, things go sideways. This is exactly what the commit in question allows to happen when it does: prev->sched_class->put_prev_task(rq, prev, rf); if (!rq->nr_running) newidle_balance(rq, rf); The newidle_balance() call will drop rq->lock after we've called put_prev_task() and that allows the above 'change' pattern to interleave and mess up the state. Furthermore, it turns out we lost the RT-pull when we put the last DL task. Fix both problems by extracting the balancing from put_prev_task() and doing a multi-class balance() pass before put_prev_task(). Fixes: 67692435c411 ("sched: Rework pick_next_task() slow-path") Reported-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Tested-by: Quentin Perret Tested-by: Valentin Schneider --- kernel/sched/idle.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched/idle.c') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..f65ef1e2f204 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -365,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -375,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } @@ -460,6 +466,7 @@ const struct sched_class idle_sched_class = { .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif -- cgit v1.2.3