summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c174
1 files changed, 69 insertions, 105 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..454adf9f8180 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -17,6 +17,8 @@
#include "../workqueue_internal.h"
#include "../smpboot.h"
+#include "pelt.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -45,14 +47,6 @@ const_debug unsigned int sysctl_sched_features =
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
+ update_irq_load_avg(rq, irq_delta + steal);
#endif
}
@@ -412,8 +406,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* its already queued (either by us or someone else) and will get the
* wakeup due to that.
*
- * This cmpxchg() implies a full barrier, which pairs with the write
- * barrier implied by the wakeup in wake_up_q().
+ * This cmpxchg() executes a full barrier, which pairs with the full
+ * barrier executed by the wakeup in wake_up_q().
*/
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
@@ -441,8 +435,8 @@ void wake_up_q(struct wake_q_head *head)
task->wake_q.next = NULL;
/*
- * wake_up_process() implies a wmb() to pair with the queueing
- * in wake_q_add() so as not to miss wakeups.
+ * wake_up_process() executes a full barrier, which pairs with
+ * the queueing in wake_q_add() so as not to miss wakeups.
*/
wake_up_process(task);
put_task_struct(task);
@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
- s64 period = sched_avg_period();
-
- while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (rq->age_stamp));
- rq->age_stamp += period;
- rq->rt_avg /= 2;
- }
-}
-
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
@@ -1280,16 +1258,17 @@ unlock:
/*
* Cross migrate two tasks
*/
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+ int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
- .src_cpu = task_cpu(cur),
+ .src_cpu = curr_cpu,
.dst_task = p,
- .dst_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
out:
return ret;
}
+#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
@@ -1879,8 +1859,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* rq(c1)->lock (if not at the same time, then in that order).
* C) LOCK of the rq(c1)->lock scheduling in task
*
- * Transitivity guarantees that B happens after A and C after B.
- * Note: we only require RCpc transitivity.
+ * Release/acquire chaining guarantees that B happens after A and C after B.
* Note: the CPU doing B need not be c0 or c1
*
* Example:
@@ -1942,16 +1921,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* UNLOCK rq(0)->lock
*
*
- * However; for wakeups there is a second guarantee we must provide, namely we
- * must observe the state that lead to our wakeup. That is, not only must our
- * task observe its own prior state, it must also observe the stores prior to
- * its wakeup.
- *
- * This means that any means of doing remote wakeups must order the CPU doing
- * the wakeup against the CPU the task is going to end up running on. This,
- * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
- *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must ensure that CONDITION=1 done by the caller can not be reordered with
+ * accesses to the task state; see try_to_wake_up() and set_current_state().
*/
/**
@@ -1967,6 +1939,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* Atomic against schedule() which would dequeue a task, also see
* set_current_state().
*
+ * This function executes a full memory barrier before accessing the task
+ * state; see set_current_state().
+ *
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
@@ -1998,21 +1973,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* be possible to, falsely, observe p->on_rq == 0 and get stuck
* in smp_cond_load_acquire() below.
*
- * sched_ttwu_pending() try_to_wake_up()
- * [S] p->on_rq = 1; [L] P->state
- * UNLOCK rq->lock -----.
- * \
- * +--- RMB
- * schedule() /
- * LOCK rq->lock -----'
- * UNLOCK rq->lock
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
*
* [task p]
- * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
*
- * Pairs with the UNLOCK+LOCK on rq->lock from the
- * last wakeup of our task and the schedule that got our task
- * current.
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
@@ -2026,15 +2000,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* One must be running (->on_cpu == 1) in order to remove oneself
* from the runqueue.
*
- * [S] ->on_cpu = 1; [L] ->on_rq
- * UNLOCK rq->lock
- * RMB
- * LOCK rq->lock
- * [S] ->on_rq = 0; [L] ->on_cpu
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
*
- * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
- * from the consecutive calls to schedule(); the first switching to our
- * task, the second putting it to sleep.
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
*/
smp_rmb();
@@ -2140,8 +2116,7 @@ out:
*
* Return: 1 if the process was woken up, 0 if it was already running.
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * This function executes a full memory barrier before accessing the task state.
*/
int wake_up_process(struct task_struct *p)
{
@@ -2317,7 +2292,6 @@ static inline void init_schedstats(void) {}
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
- int cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
@@ -2353,14 +2327,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- if (dl_prio(p->prio)) {
- put_cpu();
+ if (dl_prio(p->prio))
return -EAGAIN;
- } else if (rt_prio(p->prio)) {
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
- } else {
+ else
p->sched_class = &fair_sched_class;
- }
init_entity_runnable_average(&p->se);
@@ -2376,7 +2348,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
- __set_task_cpu(p, cpu);
+ __set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2393,8 +2365,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
-
- put_cpu();
return 0;
}
@@ -5714,13 +5684,6 @@ void set_rq_offline(struct rq *rq)
}
}
-static void set_cpu_rq_start_time(unsigned int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- rq->age_stamp = sched_clock_cpu(cpu);
-}
-
/*
* used to mark begin/end of suspend/resume:
*/
@@ -5774,6 +5737,18 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * The sched_smt_present static key needs to be evaluated on every
+ * hotplug event because at boot time SMT might be disabled when
+ * the number of booted CPUs is limited.
+ *
+ * If then later a sibling gets hotplugged, then the key would stay
+ * off and SMT scheduling would never be functional.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
+ static_branch_enable_cpuslocked(&sched_smt_present);
+#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -5838,7 +5813,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
- set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -5871,22 +5845,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
- /*
- * We've enumerated all CPUs and will assume that if any CPU
- * has SMT siblings, CPU0 will too.
- */
- if (cpumask_weight(cpu_smt_mask(0)) > 1)
- static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
void __init sched_init_smp(void)
{
sched_init_numa();
@@ -5908,8 +5866,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
- sched_init_smt();
-
sched_smp_initialized = true;
}
@@ -5954,7 +5910,6 @@ void __init sched_init(void)
int i, j;
unsigned long alloc_size = 0, ptr;
- sched_clock_init();
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6106,7 +6061,6 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
@@ -6785,6 +6739,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+ if (schedstat_enabled() && tg != &root_task_group) {
+ u64 ws = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+
+ seq_printf(sf, "wait_sum %llu\n", ws);
+ }
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */