summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/sched/core.c31
-rw-r--r--kernel/sched/cpudeadline.c24
-rw-r--r--kernel/sched/cputime.c46
-rw-r--r--kernel/sched/deadline.c118
-rw-r--r--kernel/sched/fair.c77
-rw-r--r--kernel/sched/idle.c11
-rw-r--r--kernel/sched/isolation.c3
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/pelt.h5
-rw-r--r--kernel/sched/psi.c113
-rw-r--r--kernel/sched/rt.c4
-rw-r--r--kernel/sched/sched.h41
-rw-r--r--kernel/sched/stop_task.c12
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/sysctl.c14
16 files changed, 306 insertions, 205 deletions
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 132f84a5fde3..1d9e2fdfd67a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
+ set_cpus_allowed_ptr(task,
+ housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
@@ -608,7 +610,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..15c980af63db 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1404,20 +1406,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
- const struct sched_class *class;
-
- if (p->sched_class == rq->curr->sched_class) {
+ if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ else if (p->sched_class > rq->curr->sched_class)
+ resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
@@ -4029,8 +4021,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely((prev->sched_class == &idle_sched_class ||
- prev->sched_class == &fair_sched_class) &&
+ if (likely(prev->sched_class <= &fair_sched_class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5871,7 +5862,7 @@ again:
if (task_running(p_rq, p) || p->state)
goto out_unlock;
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
@@ -6705,6 +6696,14 @@ void __init sched_init(void)
unsigned long ptr = 0;
int i;
+ /* Make sure the linker didn't screw up */
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
+ &fair_sched_class + 1 != &rt_sched_class ||
+ &rt_sched_class + 1 != &dl_sched_class);
+#ifdef CONFIG_SMP
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+#endif
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..8cb06c8c7eb1 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = capacity_orig_of(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f63f337c7147..3862a28cd05d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ unsigned long cap = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cap += capacity_orig_of(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ return __dl_bw_capacity(i);
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
#endif
static inline
@@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1604,6 +1643,7 @@ static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
+ bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
@@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -2430,8 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
}
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
+const struct sched_class dl_sched_class
+ __attribute__((section("__dl_sched_class"))) = {
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -2551,11 +2600,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
}
/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+
+/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
@@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
@@ -2715,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
+ unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
- int cpus, ret;
- unsigned long flags;
+ int ret;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ cap = dl_bw_capacity(dest_cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2737,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
+ int cpus = dl_bw_cpus(dest_cpu);
+
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2769,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
bool dl_cpu_busy(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
- int cpus;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ cap = dl_bw_capacity(cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 658aa7a2ae6f..6fab1d17c575 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
@@ -3440,16 +3440,18 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
@@ -3463,16 +3465,18 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
@@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3922,6 +3926,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
/*
@@ -3952,6 +3958,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ trace_sched_util_est_cfs_tp(cfs_rq);
+
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4017,6 +4025,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
WRITE_ONCE(p->se.avg.util_est, ue);
+
+ trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
@@ -5614,14 +5624,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
-dequeue_throttle:
- if (!se)
- sub_nr_running(rq, 1);
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, 1);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
+dequeue_throttle:
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -7157,7 +7167,7 @@ static void yield_task_fair(struct rq *rq)
set_skip_buddy(se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -8038,7 +8048,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8070,7 +8080,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
@@ -10016,7 +10026,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -10337,6 +10352,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -10357,14 +10380,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
@@ -11107,8 +11122,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+const struct sched_class fair_sched_class
+ __attribute__((section("__fair_sched_class"))) = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..6bf34986f45c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -453,11 +453,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -465,8 +460,8 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
+const struct sched_class idle_sched_class
+ __attribute__((section("__idle_sched_class"))) = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -486,8 +481,6 @@ const struct sched_class idle_sched_class = {
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..5a6ea03f9882 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+ HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..11bea3b08115 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -264,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..795e43e02afc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+}
+
/*
* When a task is dequeued, its estimated utilization should not be update if
* its util_avg has not been updated at least once.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..e53b711bd643 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
- struct kthread_worker *kworker;
+ struct task_struct *task;
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ /*
+ * Do not reschedule if already scheduled.
+ * Possible race with a timer scheduled after this check but before
+ * mod_timer below can be tolerated because group->polling_next_update
+ * will keep updates on schedule.
+ */
+ if (timer_pending(&group->poll_timer))
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->poll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
- else
- atomic_set(&group->poll_scheduled, 0);
+ if (likely(task))
+ mod_timer(&group->poll_timer, jiffies + delay);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
-
- atomic_set(&group->poll_scheduled, 0);
-
mutex_lock(&group->trigger_lock);
now = sched_clock();
@@ -623,6 +613,35 @@ out:
mutex_unlock(&group->trigger_lock);
}
+static int psi_poll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+ struct sched_param param = {
+ .sched_priority = 1,
+ };
+
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+
+ while (true) {
+ wait_event_interruptible(group->poll_wait,
+ atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_poll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, poll_timer);
+
+ atomic_set(&group->poll_wakeup, 1);
+ wake_up_interruptible(&group->poll_wait);
+}
+
static void record_times(struct psi_group_cpu *groupc, int cpu,
bool memstall_tick)
{
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_lock(&group->trigger_lock);
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
+ if (!rcu_access_pointer(group->poll_task)) {
+ struct task_struct *task;
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
+ task = kthread_create(psi_poll_worker, group, "psimon");
+ if (IS_ERR(task)) {
kfree(t);
mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ return ERR_CAST(task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
+ atomic_set(&group->poll_wakeup, 0);
+ init_waitqueue_head(&group->poll_wait);
+ wake_up_process(task);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->poll_task, task);
}
list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
{
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct task_struct *task_to_destroy = NULL;
if (static_branch_likely(&psi_disabled))
return;
@@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
+ /* Destroy poll_task when the last trigger is destroyed */
if (group->poll_states == 0) {
group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
+ task_to_destroy = rcu_dereference_protected(
+ group->poll_task,
lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
}
@@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref)
/*
* Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * poll_task RCUs to complete their read-side critical sections
+ * before destroying the trigger and optionally the poll_task
*/
synchronize_rcu();
/*
* Destroy the kworker after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
+ * can no longer be found through group->poll_task.
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ del_timer_sync(&group->poll_timer);
+ kthread_stop(task_to_destroy);
}
kfree(t);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75f38..f215eea6a966 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2429,8 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+const struct sched_class rt_sched_class
+ __attribute__((section("__rt_sched_class"))) = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..5aa6661ecaf1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -310,11 +310,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
__dl_update(dl_b, -((s32)tsk_bw / cpus));
}
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
+ u64 old_bw, u64 new_bw)
{
return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
}
extern void init_dl_bw(struct dl_bw *dl_b);
@@ -1739,7 +1754,6 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
struct sched_class {
- const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
@@ -1748,7 +1762,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
@@ -1796,7 +1810,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
-};
+} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@@ -1810,17 +1824,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __begin_sched_classes[];
+extern struct sched_class __end_sched_classes[];
+
+#define sched_class_highest (__end_sched_classes - 1)
+#define sched_class_lowest (__begin_sched_classes - 1)
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class = class->next)
+ for (class = (_from); class != (_to); class--)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, NULL)
+ for_class_range(class, sched_class_highest, sched_class_lowest)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..394bc8126a1e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -115,8 +109,8 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+const struct sched_class stop_sched_class
+ __attribute__((section("__stop_sched_class"))) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
@@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = {
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..9079d865a935 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..4aea67d3d552 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1780,6 +1780,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_rt_handler,
},
{
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_rr_timeslice_ms",
.data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),