summaryrefslogtreecommitdiff
path: root/kernel/sched/deadline.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-03 23:08:04 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-03 23:08:04 +0300
commit9bd42183b951051f73de121f7ee17091e7d26fbb (patch)
treec85c680126a0548a3c5f083e35f5b1cadce636f6 /kernel/sched/deadline.c
parent7447d56217e215e50317f308aee1ed293ac4f749 (diff)
parent72298e5c92c50edd8cb7cfda4519483ce65fa166 (diff)
downloadlinux-9bd42183b951051f73de121f7ee17091e7d26fbb.tar.xz
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - Add the SYSTEM_SCHEDULING bootup state to move various scheduler debug checks earlier into the bootup. This turns silent and sporadically deadly bugs into nice, deterministic splats. Fix some of the splats that triggered. (Thomas Gleixner) - A round of restructuring and refactoring of the load-balancing and topology code (Peter Zijlstra) - Another round of consolidating ~20 of incremental scheduler code history: this time in terms of wait-queue nomenclature. (I didn't get much feedback on these renaming patches, and we can still easily change any names I might have misplaced, so if anyone hates a new name, please holler and I'll fix it.) (Ingo Molnar) - sched/numa improvements, fixes and updates (Rik van Riel) - Another round of x86/tsc scheduler clock code improvements, in hope of making it more robust (Peter Zijlstra) - Improve NOHZ behavior (Frederic Weisbecker) - Deadline scheduler improvements and fixes (Luca Abeni, Daniel Bristot de Oliveira) - Simplify and optimize the topology setup code (Lauro Ramos Venancio) - Debloat and decouple scheduler code some more (Nicolas Pitre) - Simplify code by making better use of llist primitives (Byungchul Park) - ... plus other fixes and improvements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (103 commits) sched/cputime: Refactor the cputime_adjust() code sched/debug: Expose the number of RT/DL tasks that can migrate sched/numa: Hide numa_wake_affine() from UP build sched/fair: Remove effective_load() sched/numa: Implement NUMA node level wake_affine() sched/fair: Simplify wake_affine() for the single socket case sched/numa: Override part of migrate_degrades_locality() when idle balancing sched/rt: Move RT related code from sched/core.c to sched/rt.c sched/deadline: Move DL related code from sched/core.c to sched/deadline.c sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled sched/fair: Spare idle load balancing on nohz_full CPUs nohz: Move idle balancer registration to the idle path sched/loadavg: Generalize "_idle" naming to "_nohz" sched/core: Drop the unused try_get_task_struct() helper function sched/fair: WARN() and refuse to set buddy when !se->on_rq sched/debug: Fix SCHED_WARN_ON() to return a value on !CONFIG_SCHED_DEBUG as well sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming sched/wait: Move bit_wait_table[] and related functionality from sched/core.c to sched/wait_bit.c sched/wait: Split out the wait_bit*() APIs from <linux/wait.h> into <linux/wait_bit.h> sched/wait: Re-adjust macro line continuation backslashes in <linux/wait.h> ...
Diffstat (limited to 'kernel/sched/deadline.c')
-rw-r--r--kernel/sched/deadline.c894
1 files changed, 851 insertions, 43 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..a84299f44b5d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <uapi/linux/sched/types.h>
struct dl_bandwidth def_dl_bandwidth;
@@ -43,6 +44,254 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_SMP
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+static inline
+void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ if (dl_rq->running_bw > old)
+ dl_rq->running_bw = 0;
+}
+
+static inline
+void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ if (dl_rq->this_bw > old)
+ dl_rq->this_bw = 0;
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ struct rq *rq;
+
+ if (task_on_rq_queued(p))
+ return;
+
+ rq = task_rq(p);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(new_bw, &rq->dl);
+}
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is cancelled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ * +------------------+
+ * wakeup | ACTIVE |
+ * +------------------>+ contending |
+ * | add_running_bw | |
+ * | +----+------+------+
+ * | | ^
+ * | dequeue | |
+ * +--------+-------+ | |
+ * | | t >= 0-lag | | wakeup
+ * | INACTIVE |<---------------+ |
+ * | | sub_running_bw | |
+ * +--------+-------+ | |
+ * ^ | |
+ * | t < 0-lag | |
+ * | | |
+ * | V |
+ * | +----+------+------+
+ * | sub_running_bw | ACTIVE |
+ * +-------------------+ |
+ * inactive timer | non contending |
+ * fired +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->inactive_timer;
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ s64 zerolag_time;
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ WARN_ON(hrtimer_active(&dl_se->inactive_timer));
+ WARN_ON(dl_se->dl_non_contending);
+
+ zerolag_time = dl_se->deadline -
+ div64_long((dl_se->runtime * dl_se->dl_period),
+ dl_se->dl_runtime);
+
+ /*
+ * Using relative times instead of the absolute "0-lag time"
+ * allows to simplify the code
+ */
+ zerolag_time -= rq_clock(rq);
+
+ /*
+ * If the "0-lag time" already passed, decrease the active
+ * utilization now, instead of starting a timer
+ */
+ if (zerolag_time < 0) {
+ if (dl_task(p))
+ sub_running_bw(dl_se->dl_bw, dl_rq);
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD)
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_clear_params(p);
+ raw_spin_unlock(&dl_b->lock);
+ }
+
+ return;
+ }
+
+ dl_se->dl_non_contending = 1;
+ get_task_struct(p);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+}
+
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (flags & ENQUEUE_MIGRATED)
+ add_rq_bw(dl_se->dl_bw, dl_rq);
+
+ if (dl_se->dl_non_contending) {
+ dl_se->dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
+ put_task_struct(dl_task_of(dl_se));
+ } else {
+ /*
+ * Since "dl_non_contending" is not set, the
+ * task's utilization has already been removed from
+ * active utilization (either when the task blocked,
+ * when the "inactive timer" fired).
+ * So, add it back.
+ */
+ add_running_bw(dl_se->dl_bw, dl_rq);
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -83,6 +332,10 @@ void init_dl_rq(struct dl_rq *dl_rq)
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
+
+ dl_rq->running_bw = 0;
+ dl_rq->this_bw = 0;
+ init_dl_rq_bw_ratio(dl_rq);
}
#ifdef CONFIG_SMP
@@ -484,13 +737,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -500,6 +824,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -593,10 +925,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have changed its scheduling policy to something
* different than SCHED_DEADLINE (through switched_from_dl()).
*/
- if (!dl_task(p)) {
- __dl_clear_params(p);
+ if (!dl_task(p))
goto unlock;
- }
/*
* The task might have been boosted by someone else and might be in the
@@ -723,6 +1053,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+ dl_se->runtime = 0;
}
}
@@ -735,6 +1067,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
/*
+ * This function implements the GRUB accounting rule:
+ * according to the GRUB reclaiming algorithm, the runtime is
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
+ */
+u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
+ u64 u_act;
+ u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+
+ /*
+ * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+ * we compare u_inact + rq->dl.extra_bw with
+ * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+ * u_inact + rq->dl.extra_bw can be larger than
+ * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
+ * leading to wrong results)
+ */
+ if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+ u_act = u_act_min;
+ else
+ u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+
+ return (delta * u_act) >> BW_SHIFT;
+}
+
+/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
*/
@@ -776,6 +1149,8 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
+ delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
dl_se->runtime -= delta_exec;
throttle:
@@ -815,6 +1190,56 @@ throttle:
}
}
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ inactive_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ dl_se->dl_non_contending = 0;
+ }
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
+
+ goto unlock;
+ }
+ if (dl_se->dl_non_contending == 0)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ sub_running_bw(dl_se->dl_bw, &rq->dl);
+ dl_se->dl_non_contending = 0;
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = inactive_task_timer;
+}
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -946,10 +1371,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
update_dl_entity(dl_se, pi_se);
- else if (flags & ENQUEUE_REPLENISH)
+ } else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ }
__enqueue_dl_entity(dl_se);
}
@@ -959,11 +1386,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline < dl_se->dl_period;
-}
-
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -995,17 +1417,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_running_bw(p->dl.dl_bw, &rq->dl);
+ }
+
/*
- * If p is throttled, we do nothing. In fact, if it exhausted
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
*/
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(&p->dl, flags);
+
return;
+ }
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1023,6 +1460,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
__dequeue_task_dl(rq, p, flags);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ }
+
+ /*
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(p);
}
/*
@@ -1100,6 +1554,37 @@ out:
return cpu;
}
+static void migrate_task_rq_dl(struct task_struct *p)
+{
+ struct rq *rq;
+
+ if (p->state != TASK_WAKING)
+ return;
+
+ rq = task_rq(p);
+ /*
+ * Since p->state == TASK_WAKING, set_task_cpu() has been called
+ * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+ * rq->lock is not... So, lock it
+ */
+ raw_spin_lock(&rq->lock);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_unlock(&rq->lock);
+}
+
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -1255,19 +1740,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void task_dead_dl(struct task_struct *p)
-{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- /*
- * Since we are TASK_DEAD we won't slip out of the domain!
- */
- raw_spin_lock_irq(&dl_b->lock);
- /* XXX we should retain the bw until 0-lag */
- dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
-}
-
static void set_curr_task_dl(struct rq *rq)
{
struct task_struct *p = rq->curr;
@@ -1533,7 +2005,7 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
@@ -1551,7 +2023,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ sub_running_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
+ add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -1639,7 +2115,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ sub_running_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
set_task_cpu(p, this_cpu);
+ add_rq_bw(p->dl.dl_bw, &this_rq->dl);
+ add_running_bw(p->dl.dl_bw, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -1695,7 +2175,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw);
+ __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -1737,13 +2217,26 @@ void __init init_sched_dl_class(void)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
- * Start the deadline timer; if we switch back to dl before this we'll
- * continue consuming our current CBS slice. If we stay outside of
- * SCHED_DEADLINE until the deadline passes, the timer will reset the
- * task.
+ * task_non_contending() can start the "inactive timer" (if the 0-lag
+ * time is in the future). If the task switches back to dl before
+ * the "inactive timer" fires, it can continue to consume its current
+ * runtime using its current deadline. If it stays outside of
+ * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+ * will reset the task parameters.
*/
- if (!start_dl_timer(p))
- __dl_clear_params(p);
+ if (task_on_rq_queued(p) && p->dl.dl_runtime)
+ task_non_contending(p);
+
+ if (!task_on_rq_queued(p))
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+
+ /*
+ * We cannot use inactive_task_timer() to invoke sub_running_bw()
+ * at the 0-lag time, because the task could have been migrated
+ * while SCHED_OTHER in the meanwhile.
+ */
+ if (p->dl.dl_non_contending)
+ p->dl.dl_non_contending = 0;
/*
* Since this might be the only -deadline task on the rq,
@@ -1762,11 +2255,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
/* If p is not queued we will update its parameters at next wakeup. */
- if (!task_on_rq_queued(p))
- return;
+ if (!task_on_rq_queued(p)) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ return;
+ }
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
@@ -1836,6 +2333,7 @@ const struct sched_class dl_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
@@ -1845,7 +2343,6 @@ const struct sched_class dl_sched_class = {
.set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
- .task_dead = task_dead_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
@@ -1854,6 +2351,317 @@ const struct sched_class dl_sched_class = {
.update_curr = update_curr_dl,
};
+int sched_dl_global_validate(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
+ int cpu, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+ if (global_rt_runtime() == RUNTIME_INF) {
+ dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+ dl_rq->extra_bw = 1 << BW_SHIFT;
+ } else {
+ dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+ global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+ dl_rq->extra_bw = to_ratio(global_rt_period(),
+ global_rt_runtime());
+ }
+}
+
+void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ struct dl_bw *dl_b;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+ }
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+int sched_dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ if (hrtimer_active(&p->dl.inactive_timer))
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ /*
+ * XXX this is slightly incorrect: when the task
+ * utilization decreases, we should delay the total
+ * utilization change until the task's 0-lag point.
+ * But this would require to set the task's "inactive
+ * timer" when the task is not inactive.
+ */
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ dl_change_utilization(p, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ /*
+ * Do not decrease the total deadline utilization here,
+ * switched_from_dl() will take care to do it at the correct
+ * (0-lag) time.
+ */
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+}
+
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+bool __checkparam_dl(const struct sched_attr *attr)
+{
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
+}
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+}
+
+bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_SMP
+int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
+{
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus, ret;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, cpus);
+ ret = 0;
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return ret;
+}
+
+int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return ret;
+}
+
+bool dl_cpu_busy(unsigned int cpu)
+{
+ unsigned long flags;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+ return overflow;
+}
+#endif
+
#ifdef CONFIG_SCHED_DEBUG
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);