diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-08-29 02:43:39 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-08-29 02:43:39 +0300 |
commit | 3ca9a836ff53db8eb76d559764c07fb3b015886a (patch) | |
tree | edbd56fb3069895e1d8306df53aaa3d742a9bec8 /include | |
parent | 1a7c611546e552193180941ecf6b191e659db979 (diff) | |
parent | 2f88c8e802c8b128a155976631f4eb2ce4f3c805 (diff) | |
download | linux-3ca9a836ff53db8eb76d559764c07fb3b015886a.tar.xz |
Merge tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- The biggest change is introduction of a new iteration of the
SCHED_FAIR interactivity code: the EEVDF ("Earliest Eligible Virtual
Deadline First") scheduler
EEVDF too is a virtual-time scheduler, with two parameters (weight
and relative deadline), compared to CFS that had weight only. It
completely reworks the base scheduler: placement, preemption, picking
-- everything
LWN.net, as usual, has a terrific writeup about EEVDF:
https://lwn.net/Articles/925371/
Preemption (both tick and wakeup) is driven by testing against a
fresh pick. Because the tree is now effectively an interval tree, and
the selection is no longer the 'leftmost' task, over-scheduling is
less of a problem. A lot of the CFS heuristics are removed or
replaced by more natural latency-space parameters & constructs
In terms of expected performance regressions: we will and can fix
everything where a 'good' workload misbehaves with the new scheduler,
but EEVDF inevitably changes workload scheduling in a binary fashion,
hopefully for the better in the overwhelming majority of cases, but
in some cases it won't, especially in adversarial loads that got
lucky with the previous code, such as some variants of hackbench. We
are trying hard to err on the side of fixing all performance
regressions, but we expect some inevitable post-release iterations of
that process
- Improve load-balancing on hybrid x86 systems: enable cluster
scheduling (again)
- Improve & fix bandwidth-scheduling on nohz systems
- Improve bandwidth-throttling
- Use lock guards to simplify and de-goto-ify control flow
- Misc improvements, cleanups and fixes
* tag 'sched-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
sched/eevdf/doc: Modify the documented knob to base_slice_ns as well
sched/eevdf: Curb wakeup-preemption
sched: Simplify sched_core_cpu_{starting,deactivate}()
sched: Simplify try_steal_cookie()
sched: Simplify sched_tick_remote()
sched: Simplify sched_exec()
sched: Simplify ttwu()
sched: Simplify wake_up_if_idle()
sched: Simplify: migrate_swap_stop()
sched: Simplify sysctl_sched_uclamp_handler()
sched: Simplify get_nohz_timer_target()
sched/rt: sysctl_sched_rr_timeslice show default timeslice after reset
sched/rt: Fix sysctl_sched_rr_timeslice intial value
sched/fair: Block nohz tick_stop when cfs bandwidth in use
sched, cgroup: Restore meaning to hierarchical_quota
MAINTAINERS: Add Peter explicitly to the psi section
sched/psi: Select KERNFS as needed
sched/topology: Align group flags when removing degenerate domain
sched/fair: remove util_est boosting
sched/fair: Propagate enqueue flags into place_entity()
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cgroup-defs.h | 2 | ||||
-rw-r--r-- | include/linux/rbtree_augmented.h | 26 | ||||
-rw-r--r-- | include/linux/sched.h | 21 | ||||
-rw-r--r-- | include/linux/sched/task.h | 38 |
4 files changed, 79 insertions, 8 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be..ae20dbb885d6 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -661,6 +661,8 @@ struct cgroup_subsys { void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); + int (*css_local_stat_show)(struct seq_file *seq, + struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 7ee7ed5de722..6dbc5a1bf6a8 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } +static __always_inline struct rb_node * +rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(node, parent, link); + augment->propagate(parent, NULL); /* suboptimal */ + rb_insert_augmented_cached(node, tree, leftmost, augment); + + return leftmost ? node : NULL; +} + /* * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h index 609bde814cb0..177b3f3676ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,14 +75,14 @@ struct user_event_mm; * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * - * We have two separate sets of flags: task->state + * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ -/* Used in tsk->state: */ +/* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 @@ -92,7 +92,7 @@ struct user_event_mm; #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) -/* Used in tsk->state again: */ +/* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 @@ -173,7 +173,7 @@ struct user_event_mm; #endif /* - * set_current_state() includes a barrier so that the write of current->state + * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * @@ -196,9 +196,9 @@ struct user_event_mm; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before - * accessing p->state. + * accessing p->__state. * - * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * @@ -549,13 +549,18 @@ struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; + u64 deadline; + u64 min_deadline; + struct list_head group_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; + u64 vruntime; + s64 vlag; + u64 slice; u64 nr_migrations; @@ -2433,9 +2438,11 @@ extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); +extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } +static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index dd35ce28bb90..a23af225c898 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) } extern void __put_task_struct(struct task_struct *t); +extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); static inline void put_task_struct(struct task_struct *t) { - if (refcount_dec_and_test(&t->usage)) + if (!refcount_dec_and_test(&t->usage)) + return; + + /* + * In !RT, it is always safe to call __put_task_struct(). + * Under RT, we can only call it in preemptible context. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { + static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); + + lock_map_acquire_try(&put_task_map); __put_task_struct(t); + lock_map_release(&put_task_map); + return; + } + + /* + * under PREEMPT_RT, we can't call put_task_struct + * in atomic context because it will indirectly + * acquire sleeping locks. + * + * call_rcu() will schedule delayed_put_task_struct_rcu() + * to be called in process context. + * + * __put_task_struct() is called when + * refcount_dec_and_test(&t->usage) succeeds. + * + * This means that it can't "conflict" with + * put_task_struct_rcu_user() which abuses ->rcu the same + * way; rcu_users has a reference so task->usage can't be + * zero after rcu_users 1 -> 0 transition. + * + * delayed_free_task() also uses ->rcu, but it is only called + * when it fails to fork a process. Therefore, there is no + * way it can conflict with put_task_struct(). + */ + call_rcu(&t->rcu, __put_task_struct_rcu_cb); } DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) |