summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-29 00:53:30 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-04-29 00:53:30 +0300
commit: 586b222d748e91c619d68e9239654ebc7fed9b0c (patch)
tree: 433154fb388d301fe94831f5a5223545d20fb7f3 /include
parent: 7c339778f908875772c17f2e04ed731aac772881 (diff)
parent: f31dcb152a3d0816e2f1deab4e64572336da197d (diff)
download: linux-586b222d748e91c619d68e9239654ebc7fed9b0c.tar.xz
Merge tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Allow unprivileged PSI poll()ing

 - Fix performance regression introduced by mm_cid

 - Improve livepatch stalls by adding livepatch task switching to
   cond_resched(). This resolves livepatching busy-loop stalls with
   certain CPU-bound kthreads

 - Improve sched_move_task() performance on autogroup configs

 - On core-scheduling CPUs, avoid selecting throttled tasks to run

 - Misc cleanups, fixes and improvements

* tag 'sched-core-2023-04-27' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/clock: Fix local_clock() before sched_clock_init()
  sched/rt: Fix bad task migration for rt tasks
  sched: Fix performance regression introduced by mm_cid
  sched/core: Make sched_dynamic_mutex static
  sched/psi: Allow unprivileged polling of N*2s period
  sched/psi: Extract update_triggers side effect
  sched/psi: Rename existing poll members in preparation
  sched/psi: Rearrange polling code in preparation
  sched/fair: Fix inaccurate tally of ttwu_move_affine
  vhost: Fix livepatch timeouts in vhost_worker()
  livepatch,sched: Add livepatch task switching to cond_resched()
  livepatch: Skip task_call_func() for current task
  livepatch: Convert stack entries array to percpu
  sched: Interleave cfs bandwidth timers for improved single thread performance at low utilization
  sched/core: Reduce cost of sched_move_task when config autogroup
  sched/core: Avoid selecting the task that is throttled to run when core-sched enable
  sched/topology: Make sched_energy_mutex,update static
Diffstat (limited to 'include')
-rw-r--r-- include/linux/livepatch.h | 1
-rw-r--r-- include/linux/livepatch_sched.h | 29
-rw-r--r-- include/linux/mm_types.h | 82
-rw-r--r-- include/linux/psi.h | 2
-rw-r--r-- include/linux/psi_types.h | 43
-rw-r--r-- include/linux/sched.h | 23
-rw-r--r-- include/linux/sched/mm.h | 5
7 files changed, 153 insertions, 32 deletions
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 293e29960c6e..9b9b38e89563 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/completion.h>
#include <linux/list.h>
+#include <linux/livepatch_sched.h>
#if IS_ENABLED(CONFIG_LIVEPATCH)
diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h
new file mode 100644
index 000000000000..013794fb5da0
--- /dev/null
+++ b/include/linux/livepatch_sched.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_LIVEPATCH_SCHED_H_
+#define _LINUX_LIVEPATCH_SCHED_H_
+
+#include <linux/jump_label.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_LIVEPATCH
+
+void __klp_sched_try_switch(void);
+
+#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+/*
+ * Invoke __klp_sched_try_switch() only while the
+ * klp_sched_try_switch_key static branch is enabled; otherwise this is
+ * a no-op.
+ */
+static __always_inline void klp_sched_try_switch(void)
+{
+	if (!static_branch_unlikely(&klp_sched_try_switch_key))
+		return;
+
+	__klp_sched_try_switch();
+}
+
+#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+#else /* !CONFIG_LIVEPATCH */
+/* CONFIG_LIVEPATCH is off: both hooks are empty inline functions. */
+static inline void klp_sched_try_switch(void) {}
+static inline void __klp_sched_try_switch(void) {}
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3fc9e680f174..306a3d1a0fa6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -573,6 +573,13 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Per-cpu mm_cid record. mm_struct::pcpu_cid points at a per-cpu
+ * allocation of these, and mm_init_cid() resets one for every possible
+ * CPU.
+ */
+struct mm_cid {
+ u64 time; /* zeroed by mm_init_cid(); units not visible here - TODO confirm */
+ int cid; /* set to MM_CID_UNSET by mm_init_cid() */
+};
+#endif
+
struct kioctx_table;
struct mm_struct {
struct {
@@ -623,15 +630,19 @@ struct mm_struct {
atomic_t mm_count;
#ifdef CONFIG_SCHED_MM_CID
/**
- * @cid_lock: Protect cid bitmap updates vs lookups.
+ * @pcpu_cid: Per-cpu current cid.
*
- * Prevent situations where updates to the cid bitmap happen
- * concurrently with lookups. Those can lead to situations
- * where a lookup cannot find a free bit simply because it was
- * unlucky enough to load, non-atomically, bitmap words as they
- * were being concurrently updated by the updaters.
+ * Keep track of the currently allocated mm_cid for each cpu.
+ * The per-cpu mm_cid values are serialized by their respective
+ * runqueue locks.
*/
- raw_spinlock_t cid_lock;
+ struct mm_cid __percpu *pcpu_cid;
+ /**
+ * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
+ *
+ * When the next mm_cid scan is due (in jiffies).
+ */
+ unsigned long mm_cid_next_scan;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -899,6 +910,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
+
+/*
+ * Special encodings carried in an int cid value. MM_CID_LAZY_PUT is a
+ * flag in bit 31. MM_CID_UNSET is all-ones and so also has that bit
+ * set, which is why mm_cid_is_lazy_put() excludes it explicitly.
+ */
+enum mm_cid_state {
+ MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
+ MM_CID_LAZY_PUT = (1U << 31),
+};
+
+/* Return true when @cid is exactly the MM_CID_UNSET sentinel. */
+static inline bool mm_cid_is_unset(int cid)
+{
+	const bool is_sentinel = (cid == MM_CID_UNSET);
+
+	return is_sentinel;
+}
+
+/*
+ * Return true when @cid carries the MM_CID_LAZY_PUT flag and is not the
+ * MM_CID_UNSET sentinel (which has the same bit set).
+ */
+static inline bool mm_cid_is_lazy_put(int cid)
+{
+	if (mm_cid_is_unset(cid))
+		return false;
+
+	return (cid & MM_CID_LAZY_PUT) != 0;
+}
+
+/*
+ * Return true when the MM_CID_LAZY_PUT bit is clear in @cid. Because
+ * MM_CID_UNSET has that bit set, an unset cid is reported as not valid.
+ */
+static inline bool mm_cid_is_valid(int cid)
+{
+	return (cid & MM_CID_LAZY_PUT) == 0;
+}
+
+/* Return @cid with the MM_CID_LAZY_PUT flag bit set. */
+static inline int mm_cid_set_lazy_put(int cid)
+{
+	cid |= MM_CID_LAZY_PUT;
+	return cid;
+}
+
+/* Return @cid with the MM_CID_LAZY_PUT flag bit cleared. */
+static inline int mm_cid_clear_lazy_put(int cid)
+{
+	cid &= ~MM_CID_LAZY_PUT;
+	return cid;
+}
+
/* Accessor for struct mm_struct's cidmask. */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
@@ -912,16 +954,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+/*
+ * Reset the per-cpu mm_cid entry of every possible CPU to MM_CID_UNSET
+ * with a zero time, then clear the mm's cid mask. mm->pcpu_cid is
+ * dereferenced through per_cpu_ptr(), so it must already be allocated.
+ */
static inline void mm_init_cid(struct mm_struct *mm)
{
- raw_spin_lock_init(&mm->cid_lock);
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
+
+ pcpu_cid->cid = MM_CID_UNSET;
+ pcpu_cid->time = 0;
+ }
cpumask_clear(mm_cidmask(mm));
}
+/*
+ * Allocate the per-cpu mm_cid storage for @mm and initialise it with
+ * mm_init_cid().
+ *
+ * Return: 0 on success, -ENOMEM when the per-cpu allocation fails (in
+ * which case mm->pcpu_cid is left NULL).
+ */
+static inline int mm_alloc_cid(struct mm_struct *mm)
+{
+	struct mm_cid __percpu *cids = alloc_percpu(struct mm_cid);
+
+	mm->pcpu_cid = cids;
+	if (!cids)
+		return -ENOMEM;
+
+	mm_init_cid(mm);
+	return 0;
+}
+
+/*
+ * Release the per-cpu mm_cid storage of @mm and clear the pointer so it
+ * no longer refers to freed memory.
+ */
+static inline void mm_destroy_cid(struct mm_struct *mm)
+{
+	struct mm_cid __percpu *cids = mm->pcpu_cid;
+
+	free_percpu(cids);
+	mm->pcpu_cid = NULL;
+}
+
+/* Returns cpumask_size(); the cid mask is a cpumask_t (see mm_cidmask()). */
static inline unsigned int mm_cid_size(void)
{
return cpumask_size();
}
#else /* CONFIG_SCHED_MM_CID */
+/* CONFIG_SCHED_MM_CID is off: no-op stubs; mm_alloc_cid() always reports success. */
static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_cid(struct mm_struct *mm) { }
static inline unsigned int mm_cid_size(void)
{
return 0;
diff --git a/include/linux/psi.h b/include/linux/psi.h
index b029a847def1..ab26200c2803 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res);
+ char *buf, enum psi_res res, struct file *file);
void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1e0a0d7ace3a..040c089581c6 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -151,6 +151,9 @@ struct psi_trigger {
/* Deferred event(s) from previous ratelimit window */
bool pending_event;
+
+ /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
+ enum psi_aggregators aggregator;
};
struct psi_group {
@@ -171,30 +174,34 @@ struct psi_group {
/* Aggregator work control */
struct delayed_work avgs_work;
+ /* Unprivileged triggers against N*PSI_FREQ windows */
+ struct list_head avg_triggers;
+ u32 avg_nr_triggers[NR_PSI_STATES - 1];
+
/* Total stall times and sampled pressure averages */
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];
- /* Monitor work control */
- struct task_struct __rcu *poll_task;
- struct timer_list poll_timer;
- wait_queue_head_t poll_wait;
- atomic_t poll_wakeup;
- atomic_t poll_scheduled;
+ /* Monitor RT polling work control */
+ struct task_struct __rcu *rtpoll_task;
+ struct timer_list rtpoll_timer;
+ wait_queue_head_t rtpoll_wait;
+ atomic_t rtpoll_wakeup;
+ atomic_t rtpoll_scheduled;
/* Protects data used by the monitor */
- struct mutex trigger_lock;
-
- /* Configured polling triggers */
- struct list_head triggers;
- u32 nr_triggers[NR_PSI_STATES - 1];
- u32 poll_states;
- u64 poll_min_period;
-
- /* Total stall times at the start of monitor activation */
- u64 polling_total[NR_PSI_STATES - 1];
- u64 polling_next_update;
- u64 polling_until;
+ struct mutex rtpoll_trigger_lock;
+
+ /* Configured RT polling triggers */
+ struct list_head rtpoll_triggers;
+ u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
+ u32 rtpoll_states;
+ u64 rtpoll_min_period;
+
+ /* Total stall times at the start of RT polling monitor activation */
+ u64 rtpoll_total[NR_PSI_STATES - 1];
+ u64 rtpoll_next_update;
+ u64 rtpoll_until;
};
#else /* CONFIG_PSI */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3f5395ae86bc..dc4ad4c58fae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
+#include <linux/livepatch_sched.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -1313,7 +1314,10 @@ struct task_struct {
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
+ int last_mm_cid; /* Most recent cid in mm */
+ int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
+ struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
@@ -2067,6 +2071,9 @@ extern int __cond_resched(void);
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
@@ -2075,6 +2082,7 @@ static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
@@ -2082,20 +2090,25 @@ static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
-#else
+#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
+ /* Livepatch task-switch hook runs before the actual reschedule attempt. */
+ klp_sched_try_switch();
return __cond_resched();
}
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+ /* __cond_resched() is not called in this branch; only the livepatch hook runs. */
+ klp_sched_try_switch();
+ return 0;
+}
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index af12fcb11005..b114fbe3a93b 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
}
+/*
+ * Barrier intended to follow mmgrab(). mmgrab() performs
+ * atomic_inc(&mm->mm_count), so this simply issues
+ * smp_mb__after_atomic().
+ */
+static inline void smp_mb__after_mmgrab(void)
+{
+ smp_mb__after_atomic();
+}
+
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)