From ca528cc501896a808dc79c3c0544369d23b331c8 Mon Sep 17 00:00:00 2001
From: Ricardo Neri
Date: Thu, 6 Apr 2023 13:31:45 -0700
Subject: sched/topology: Remove SHARED_CHILD from ASYM_PACKING

Only x86 and Power7 use ASYM_PACKING. They use it differently.

Power7 has cores of equal priority, but the SMT siblings of a core have
different priorities. Parent scheduling domains do not need (nor have) the
ASYM_PACKING flag. SHARED_CHILD is not needed. Using SHARED_PARENT would
cause the topology debug code to complain.

X86 has cores of different priority, but all the SMT siblings of the core
have equal priority. It needs ASYM_PACKING at the MC level, but not at the
SMT level (it also needs it at upper levels if they have scheduling groups
of different priority). Removing ASYM_PACKING from the SMT domain causes
the topology debug code to complain.

Remove SHARED_CHILD for now. We still need a topology check that satisfies
both architectures.

Suggested-by: Valentin Schneider
Signed-off-by: Ricardo Neri
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Zhang Rui
Link: https://lore.kernel.org/r/20230406203148.19182-10-ricardo.neri-calderon@linux.intel.com
---
 include/linux/sched/sd_flags.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..fad77b5172e2 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 /*
  * Place busy tasks earlier in the domain
  *
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- *               up, but currently assumed to be set from the base domain
- *               upwards (see update_top_cache_domain()).
  * NEEDS_GROUPS: Load balancing flag.
  */
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
 
 /*
  * Prefer to place tasks in a sibling domain
-- 
cgit v1.2.3


From d5e1586617be7093ea3419e3fa9387ed833cdbb1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 2 Jun 2023 10:42:53 +0200
Subject: sched: Unconditionally use full-fat wait_task_inactive()

While modifying wait_task_inactive() for PREEMPT_RT, the build robot
noted that UP got broken. This led to an audit and consideration of the
UP implementation of wait_task_inactive().

It looks like the UP implementation is also broken for PREEMPT; consider
task_current_syscall() getting preempted between the two calls to
wait_task_inactive().

Therefore move the wait_task_inactive() implementation out of CONFIG_SMP
and unconditionally use it.
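
For context, the double-call pattern mentioned above looks roughly like
this -- a sketch modeled on lib/syscall.c's task_current_syscall(), with
illustrative function and variable names, not the exact kernel code:

  static int sample_remote_task(struct task_struct *p, unsigned int state)
  {
          unsigned long ncsw;

          /* Wait until @p is off its CPU and record its switch count. */
          ncsw = wait_task_inactive(p, state);
          if (!ncsw)
                  return -EAGAIN; /* @p changed state; caller retries */

          /* ... inspect @p here, without holding any locks ... */

          /*
           * An unchanged switch count proves @p stayed unscheduled the
           * whole time, so the inspection above was consistent. The old
           * UP stub always returned 1, so a preemption between the two
           * calls went undetected.
           */
          if (wait_task_inactive(p, state) != ncsw)
                  return -EAGAIN;

          return 0;
  }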
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20230602103731.GA630648%40hirez.programming.kicks-ass.net
---
 include/linux/sched.h |   7 +-
 kernel/sched/core.c   | 216 +++++++++++++++++++++++++-------------------------
 2 files changed, 110 insertions(+), 113 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eed5d65b8d1f..1292d38d66cc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2006,15 +2006,12 @@ static __always_inline void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 }
-extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 #else
 static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
-	return 1;
-}
 #endif
 
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
+
 /*
  * Set thread flags in other task's structures.
  * See asm/thread_info.h for TIF_xxxx flags available:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 944c3ae39861..810cf7dc98cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2213,6 +2213,114 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq_clock_skip_update(rq);
 }
 
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+	int running, queued;
+	struct rq_flags rf;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		/*
+		 * We do the initial early heuristics without holding
+		 * any task-queue locks at all. We'll only try to get
+		 * the runqueue lock when things look like they will
+		 * work out!
+		 */
+		rq = task_rq(p);
+
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since "task_on_cpu()" will
+		 * return false if the runqueue has changed and p
+		 * is actually now running somewhere else!
+		 */
+		while (task_on_cpu(rq, p)) {
+			if (!(READ_ONCE(p->__state) & match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the rq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_rq_lock(p, &rf);
+		trace_sched_wait_task(p);
+		running = task_on_cpu(rq, p);
+		queued = task_on_rq_queued(p);
+		ncsw = 0;
+		if (READ_ONCE(p->__state) & match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+		task_rq_unlock(rq, p, &rf);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(queued)) {
+			ktime_t to = NSEC_PER_SEC / HZ;
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
 #ifdef CONFIG_SMP
 
 static void
@@ -3341,114 +3449,6 @@ out:
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero. When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count). If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
-	int running, queued;
-	struct rq_flags rf;
-	unsigned long ncsw;
-	struct rq *rq;
-
-	for (;;) {
-		/*
-		 * We do the initial early heuristics without holding
-		 * any task-queue locks at all. We'll only try to get
-		 * the runqueue lock when things look like they will
-		 * work out!
-		 */
-		rq = task_rq(p);
-
-		/*
-		 * If the task is actively running on another CPU
-		 * still, just relax and busy-wait without holding
-		 * any locks.
-		 *
-		 * NOTE! Since we don't hold any locks, it's not
-		 * even sure that "rq" stays as the right runqueue!
-		 * But we don't care, since "task_on_cpu()" will
-		 * return false if the runqueue has changed and p
-		 * is actually now running somewhere else!
-		 */
-		while (task_on_cpu(rq, p)) {
-			if (!(READ_ONCE(p->__state) & match_state))
-				return 0;
-			cpu_relax();
-		}
-
-		/*
-		 * Ok, time to look more closely! We need the rq
-		 * lock now, to be *sure*. If we're wrong, we'll
-		 * just go back and repeat.
-		 */
-		rq = task_rq_lock(p, &rf);
-		trace_sched_wait_task(p);
-		running = task_on_cpu(rq, p);
-		queued = task_on_rq_queued(p);
-		ncsw = 0;
-		if (READ_ONCE(p->__state) & match_state)
-			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, p, &rf);
-
-		/*
-		 * If it changed from the expected state, bail out now.
-		 */
-		if (unlikely(!ncsw))
-			break;
-
-		/*
-		 * Was it really running after all now that we
-		 * checked with the proper locks actually held?
-		 *
-		 * Oops. Go back and try again..
-		 */
-		if (unlikely(running)) {
-			cpu_relax();
-			continue;
-		}
-
-		/*
-		 * It's not enough that it's not actively running,
-		 * it must be off the runqueue _entirely_, and not
-		 * preempted!
-		 *
-		 * So if it was still runnable (but just not actively
-		 * running right now), it's preempted, and we should
-		 * yield - it could be a while.
-		 */
-		if (unlikely(queued)) {
-			ktime_t to = NSEC_PER_SEC / HZ;
-
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
-			continue;
-		}
-
-		/*
-		 * Ahh, all good. It wasn't running, and it wasn't
-		 * runnable, which means that it will never become
-		 * running in the future either. We're all done!
-		 */
-		break;
-	}
-
-	return ncsw;
-}
-
 /***
  * kick_process - kick a running thread to enter/exit the kernel
  * @p: the to-be-kicked thread
-- 
cgit v1.2.3


From d16317de9b412aa7bd3598c607112298e36b4352 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 May 2023 12:20:59 +0200
Subject: seqlock/latch: Provide raw_read_seqcount_latch_retry()

The read side of seqcount_latch consists of:

  do {
    seq = raw_read_seqcount_latch(&latch->seq);
    ...
  } while (read_seqcount_latch_retry(&latch->seq, seq));

which is asymmetric in the raw_ department, and sure enough,
read_seqcount_latch_retry() includes (explicit) instrumentation where
raw_read_seqcount_latch() does not.

This inconsistency becomes a problem when trying to use it from noinstr
code. As such, fix it by renaming and re-implementing
raw_read_seqcount_latch_retry() without the instrumentation.

Specifically the instrumentation in question is kcsan_atomic_next(0) in
do___read_seqcount_retry(). Losing this annotation is not a problem
because raw_read_seqcount_latch() does not pass through
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX).

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Reviewed-by: Petr Mladek
Tested-by: Michael Kelley  # Hyper-V
Link: https://lore.kernel.org/r/20230519102715.233598176@infradead.org
---
 include/linux/rbtree_latch.h |  2 +-
 include/linux/seqlock.h      | 15 ++++++++-------
 kernel/printk/printk.c       |  2 +-
 kernel/time/sched_clock.c    |  2 +-
 kernel/time/timekeeping.c    |  4 ++--
 5 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
index 3d1a9e716b80..6a0999c26c7c 100644
--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root,
 	do {
 		seq = raw_read_seqcount_latch(&root->seq);
 		node = __lt_find(key, root, seq & 1, ops->comp);
-	} while (read_seqcount_latch_retry(&root->seq, seq));
+	} while (raw_read_seqcount_latch_retry(&root->seq, seq));
 
 	return node;
 }
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 3926e9027947..987a59d977c5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -671,9 +671,9 @@ typedef struct {
  *
  * Return: sequence counter raw value. Use the lowest bit as an index for
  * picking which data copy to read. The full counter must then be checked
- * with read_seqcount_latch_retry().
+ * with raw_read_seqcount_latch_retry().
  */
-static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
+static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
 {
 	/*
 	 * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
@@ -683,16 +683,17 @@ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
 }
 
 /**
- * read_seqcount_latch_retry() - end a seqcount_latch_t read section
+ * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
  * @s: Pointer to seqcount_latch_t
  * @start: count, from raw_read_seqcount_latch()
  *
  * Return: true if a read section retry is required, else false
  */
-static inline int
-read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
+static __always_inline int
+raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
 {
-	return read_seqcount_retry(&s->seqcount, start);
+	smp_rmb();
+	return unlikely(READ_ONCE(s->seqcount.sequence) != start);
 }
 
 /**
@@ -752,7 +753,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
 *	entry = data_query(latch->data[idx], ...);
 *
 *	// This includes needed smp_rmb()
- *	} while (read_seqcount_latch_retry(&latch->seq, seq));
+ *	} while (raw_read_seqcount_latch_retry(&latch->seq, seq));
 *
 *	return entry;
 * }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6a333adce3b3..357a4d18f638 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
 		seq = raw_read_seqcount_latch(&ls->latch);
 		idx = seq & 0x1;
 		val = ls->val[idx];
-	} while (read_seqcount_latch_retry(&ls->latch, seq));
+	} while (raw_read_seqcount_latch_retry(&ls->latch, seq));
 
 	return val;
 }
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 8464c5acc913..e8f2fb09a214 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -77,7 +77,7 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
 
 notrace int sched_clock_read_retry(unsigned int seq)
 {
-	return read_seqcount_latch_retry(&cd.seq, seq);
+	return raw_read_seqcount_latch_retry(&cd.seq, seq);
 }
 
 unsigned long long notrace sched_clock(void)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 09d594900ee0..266d02809dbb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
 		now += fast_tk_get_delta_ns(tkr);
-	} while (read_seqcount_latch_retry(&tkf->seq, seq));
+	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
 }
@@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
 		basem = ktime_to_ns(tkr->base);
 		baser = ktime_to_ns(tkr->base_real);
 		delta = fast_tk_get_delta_ns(tkr);
-	} while (read_seqcount_latch_retry(&tkf->seq, seq));
+	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
 
 	if (mono)
 		*mono = basem + delta;
-- 
cgit v1.2.3


From fc4a0db4149afcdae2527f0d8c376accca34adc9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 May 2023 12:21:05 +0200
Subject: math64: Always inline u128 version of mul_u64_u64_shr()

In order to prevent the following complaint from happening, always
inline the u128 variant of mul_u64_u64_shr() -- which is what x86_64
will use:

  vmlinux.o: warning: objtool: read_hv_sched_clock_tsc+0x5a: call to mul_u64_u64_shr.constprop.0() leaves .noinstr.text section

It should compile into something like:

  asm("mul %[mul];"
      "shrd %rdx, %rax, %cl"
      : "+&a" (a)
      : "c" (shift), [mul] "r" (mul)
      : "d");

Which is silly not to inline, but it happens.
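
For reference, the typical consumer is 64x64->128 fixed-point scaling,
such as the TSC-page conversion later in this series; a minimal sketch,
with illustrative variable names:

  u64 time;

  /* ((tsc * scale) >> 64) + offset, keeping the full 128-bit product */
  time = mul_u64_u64_shr(tsc, scale, 64) + offset;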
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Michael Kelley  # Hyper-V
Link: https://lore.kernel.org/r/20230519102715.637420396@infradead.org
---
 include/linux/math64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/math64.h b/include/linux/math64.h
index 8b9191a2849e..bf74478926d4 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -168,7 +168,7 @@ static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 #endif /* mul_u64_u32_shr */
 
 #ifndef mul_u64_u64_shr
-static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
 {
 	return (u64)(((unsigned __int128)a * mul) >> shift);
 }
-- 
cgit v1.2.3


From 9397fa2ea3e7634f61da1ab76b9eb88ba04dfdfc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 May 2023 12:21:07 +0200
Subject: clocksource: hyper-v: Adjust hv_read_tsc_page_tsc() to avoid special
 casing U64_MAX

Currently hv_read_tsc_page_tsc() (ab)uses the (valid) time value of
U64_MAX as an error return. This breaks the clean wrap-around of the
clock.

Modify the function signature to return a boolean state and provide
another u64 pointer to store the actual time on success. This obviates
the need to steal one time value and restores the full counter width.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Michael Kelley
Tested-by: Michael Kelley  # Hyper-V
Link: https://lore.kernel.org/r/20230519102715.775630881@infradead.org
---
 arch/x86/include/asm/vdso/gettimeofday.h | 10 ++++++----
 arch/x86/kvm/x86.c                       |  7 +++----
 drivers/clocksource/hyperv_timer.c       | 16 +++++++++++-----
 include/clocksource/hyperv_timer.h       | 24 +++++++++---------------
 4 files changed, 29 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 0badf0a9f03d..c81858d903dc 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -238,10 +238,12 @@ static u64 vread_pvclock(void)
 #ifdef CONFIG_HYPERV_TIMER
 static u64 vread_hvclock(void)
 {
-	u64 ret = hv_read_tsc_page(&hvclock_page);
-	if (likely(ret != U64_MAX))
-		ret &= S64_MAX;
-	return ret;
+	u64 tsc, time;
+
+	if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+		return time & S64_MAX;
+
+	return U64_MAX;
 }
 #endif
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ceb7c5e9cf9e..99d97ba6104f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2799,14 +2799,13 @@ static u64 read_tsc(void)
 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
 			  int *mode)
 {
-	long v;
 	u64 tsc_pg_val;
+	long v;
 
 	switch (clock->vclock_mode) {
 	case VDSO_CLOCKMODE_HVCLOCK:
-		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
-						  tsc_timestamp);
-		if (tsc_pg_val != U64_MAX) {
+		if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+					 tsc_timestamp, &tsc_pg_val)) {
 			/* TSC page valid */
 			*mode = VDSO_CLOCKMODE_HVCLOCK;
 			v = (tsc_pg_val - clock->cycle_last) &
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index bcd9042a0c9f..c643bfe2f3d4 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -393,14 +393,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 }
 EXPORT_SYMBOL_GPL(hv_get_tsc_page);
 
-static u64 notrace read_hv_clock_tsc(void)
+static notrace u64 read_hv_clock_tsc(void)
 {
-	u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
+	u64 cur_tsc, time;
 
-	if (current_tick == U64_MAX)
-		current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+	/*
+	 * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
+	 * subsection Reference Counter, guarantees that the TSC and MSR
+	 * times are in sync and monotonic. Therefore we can fall back
+	 * to the MSR in case the TSC page indicates unavailability.
+	 */
+	if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
+		time = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
 
-	return current_tick;
+	return time;
 }
 
 static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h
index 536f897375d0..6cdc873ac907 100644
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -38,8 +38,9 @@ extern void hv_remap_tsc_clocksource(void);
 extern unsigned long hv_get_tsc_pfn(void);
 extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
 
-static inline notrace u64
-hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+		     u64 *cur_tsc, u64 *time)
 {
 	u64 scale, offset;
 	u32 sequence;
@@ -63,7 +64,7 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
 	do {
 		sequence = READ_ONCE(tsc_pg->tsc_sequence);
 		if (!sequence)
-			return U64_MAX;
+			return false;
 		/*
 		 * Make sure we read sequence before we read other values from
 		 * TSC page.
@@ -82,15 +83,8 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
 
 	} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
 
-	return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
-}
-
-static inline notrace u64
-hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
-{
-	u64 cur_tsc;
-
-	return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
+	*time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+	return true;
 }
 
 #else /* CONFIG_HYPERV_TIMER */
@@ -104,10 +98,10 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 	return NULL;
 }
 
-static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
-				       u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time)
 {
-	return U64_MAX;
+	return false;
 }
 
 static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; }
-- 
cgit v1.2.3


From fb7d4948c4da2dbd26da4b7ec76bbd2f19ff862a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 19 May 2023 12:21:10 +0200
Subject: sched/clock: Provide local_clock_noinstr()

Now that all ARCH_WANTS_NO_INSTR architectures (arm64, loongarch, s390,
x86) provide sched_clock_noinstr(), use this to provide
local_clock_noinstr().

This local_clock_noinstr() will be safe to use from noinstr code with
the assumption that any such noinstr code is non-preemptible (it had
better be, entry code will have IRQs disabled while __cpuidle must have
preemption disabled).

Specifically, preempt_enable_notrace(), a common part of many a
sched_clock() implementation, calls out to schedule() -- even though,
per the above, it will never trigger -- which frustrates noinstr
validation.
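
As a sketch of the intended consumer (the function name is hypothetical;
any noinstr or __cpuidle caller runs non-preemptible per the above):

  static noinstr u64 example_idle_timestamp(void)
  {
          /* Safe without the preempt_{dis,en}able_notrace() pair. */
          return local_clock_noinstr();
  }

The objtool complaint in question: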
  vmlinux.o: warning: objtool: local_clock+0xb5: call to preempt_schedule_notrace_thunk() leaves .noinstr.text section

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Michael Kelley  # Hyper-V
Link: https://lore.kernel.org/r/20230519102715.978624636@infradead.org
---
 include/linux/sched/clock.h | 17 ++++++++++++++++-
 kernel/sched/clock.c        | 19 +++++++++++++------
 2 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index ca008f7d3615..196f0ca351a2 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -12,7 +12,16 @@
  *
  * Please use one of the three interfaces below.
  */
-extern unsigned long long notrace sched_clock(void);
+extern u64 sched_clock(void);
+
+#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
+extern u64 sched_clock_noinstr(void);
+#else
+static __always_inline u64 sched_clock_noinstr(void)
+{
+	return sched_clock();
+}
+#endif
 
 /*
  * See the comment in kernel/sched/clock.c
@@ -45,6 +54,11 @@ static inline u64 cpu_clock(int cpu)
 	return sched_clock();
 }
 
+static __always_inline u64 local_clock_noinstr(void)
+{
+	return sched_clock_noinstr();
+}
+
 static __always_inline u64 local_clock(void)
 {
 	return sched_clock();
@@ -79,6 +93,7 @@ static inline u64 cpu_clock(int cpu)
 	return sched_clock_cpu(cpu);
 }
 
+extern u64 local_clock_noinstr(void);
 extern u64 local_clock(void);
 
 #endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b5cc2b53464d..5a575a0ba4e6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
 	s64 delta;
 
 again:
-	now = sched_clock();
+	now = sched_clock_noinstr();
 	delta = now - scd->tick_raw;
 	if (unlikely(delta < 0))
 		delta = 0;
@@ -293,22 +293,29 @@ again:
 	return clock;
 }
 
-noinstr u64 local_clock(void)
+noinstr u64 local_clock_noinstr(void)
 {
 	u64 clock;
 
 	if (static_branch_likely(&__sched_clock_stable))
-		return sched_clock() + __sched_clock_offset;
+		return sched_clock_noinstr() + __sched_clock_offset;
 
 	if (!static_branch_likely(&sched_clock_running))
-		return sched_clock();
+		return sched_clock_noinstr();
 
-	preempt_disable_notrace();
 	clock = sched_clock_local(this_scd());
-	preempt_enable_notrace();
 
 	return clock;
 }
+
+u64 local_clock(void)
+{
+	u64 now;
+
+	preempt_disable_notrace();
+	now = local_clock_noinstr();
+	preempt_enable_notrace();
+
+	return now;
+}
 EXPORT_SYMBOL_GPL(local_clock);
 
 static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
-- 
cgit v1.2.3


From 0cce0fde499a92c726cd2e24f7763644f7c9f971 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Sat, 3 Jun 2023 15:36:45 +0800
Subject: sched/topology: Mark set_sched_topology() __init

All callers of set_sched_topology() are within the __init section.
Mark it __init too.
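
A sketch of the call-site shape this change relies on (the table and
function names here are hypothetical; the real callers are the
architectures' early SMP setup code):

  static struct sched_domain_topology_level example_topology[] __initdata = {
  #ifdef CONFIG_SCHED_SMT
          { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
  #endif
          { cpu_cpu_mask, SD_INIT_NAME(DIE) },
          { NULL, },
  };

  static void __init example_smp_prepare(void)
  {
          set_sched_topology(example_topology);
  }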
Signed-off-by: Miaohe Lin
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Link: https://lore.kernel.org/r/20230603073645.1173332-1-linmiaohe@huawei.com
---
 include/linux/sched/topology.h | 2 +-
 kernel/sched/topology.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..67b573d5bf28 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -203,7 +203,7 @@ struct sched_domain_topology_level {
 #endif
 };
 
-extern void set_sched_topology(struct sched_domain_topology_level *tl);
+extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 
 #ifdef CONFIG_SCHED_DEBUG
 # define SD_INIT_NAME(type)		.name = #type
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ca4472281c28..cb92dc5f5646 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1681,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
 #define for_each_sd_topology(tl)	\
 	for (tl = sched_domain_topology; tl->mask; tl++)
 
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
 {
 	if (WARN_ON_ONCE(sched_smp_initialized))
 		return;
-- 
cgit v1.2.3


From ef73d6a4ef0b35524125c3cfc6deafc26a0c966a Mon Sep 17 00:00:00 2001
From: Arve Hjønnevåg
Date: Fri, 2 Jun 2023 21:23:46 +0000
Subject: sched/wait: Fix a kthread_park race with wait_woken()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kthread_park() and wait_woken() have a race similar to the one
kthread_stop() and wait_woken() used to have before it was fixed in
commit cb6538e740d7 ("sched/wait: Fix a kthread race with
wait_woken()"). Extend that fix to also cover kthread_park().

[jstultz: Made changes suggested by Peter to optimize memory loads]
Signed-off-by: Arve Hjønnevåg
Signed-off-by: John Stultz
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Link: https://lore.kernel.org/r/20230602212350.535358-1-jstultz@google.com
---
 include/linux/kthread.h |  1 +
 kernel/kthread.c        | 10 ++++++++++
 kernel/sched/wait.c     |  7 +------
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 30e5bec81d2b..f1f95a71a4bc 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -89,6 +89,7 @@ int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
 bool __kthread_should_park(struct task_struct *k);
+bool kthread_should_stop_or_park(void);
 bool kthread_freezable_should_stop(bool *was_frozen);
 void *kthread_func(struct task_struct *k);
 void *kthread_data(struct task_struct *k);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490792b1066e..07a057086d26 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -182,6 +182,16 @@ bool kthread_should_park(void)
 }
 EXPORT_SYMBOL_GPL(kthread_should_park);
 
+bool kthread_should_stop_or_park(void)
+{
+	struct kthread *kthread = __to_kthread(current);
+
+	if (!kthread)
+		return false;
+
+	return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
 /**
  * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b74730738..48c53e4739ea 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 }
 EXPORT_SYMBOL(autoremove_wake_function);
 
-static inline bool is_kthread_should_stop(void)
-{
-	return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
 /*
  * DEFINE_WAIT_FUNC(wait, woken_wake_func);
  *
@@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
 	 * or woken_wake_function() sees our store to current->state.
 	 */
 	set_current_state(mode); /* A */
-	if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+	if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
 		timeout = schedule_timeout(timeout);
 	__set_current_state(TASK_RUNNING);
 
-- 
cgit v1.2.3
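
For illustration, the wait_woken() consumer pattern the last patch above
protects (a sketch; the kthread body and wait queue are invented, the
APIs are the real ones):

  static int example_kthread(void *arg)
  {
          struct wait_queue_head *wq = arg;
          DEFINE_WAIT_FUNC(wait, woken_wake_function);

          add_wait_queue(wq, &wait);
          while (!kthread_should_stop()) {
                  if (kthread_should_park())
                          kthread_parkme();
                  /*
                   * Before the fix, wait_woken() broke out early only
                   * for a pending stop request; a park request arriving
                   * after the kthread_should_park() check above could
                   * sleep here for the full timeout, stalling
                   * kthread_park().
                   */
                  wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
          }
          remove_wait_queue(wq, &wait);
          return 0;
  }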