From 698429f9d0e54ce3964151adff886ee5fc59714b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 3 Aug 2021 16:16:17 +0200
Subject: clocksource: Replace deprecated CPU-hotplug functions.

The functions get_online_cpus() and put_online_cpus() have been
deprecated during the CPU hotplug rework. They map directly to
cpus_read_lock() and cpus_read_unlock().

Replace deprecated CPU-hotplug functions with the official version.
The behavior remains unchanged.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210803141621.780504-35-bigeasy@linutronix.de
---
 kernel/time/clocksource.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b89c76e1c02c..b8a14d2fb5ba 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -306,12 +306,12 @@ void clocksource_verify_percpu(struct clocksource *cs)
         return;
     cpumask_clear(&cpus_ahead);
     cpumask_clear(&cpus_behind);
-    get_online_cpus();
+    cpus_read_lock();
     preempt_disable();
     clocksource_verify_choose_cpus();
     if (cpumask_weight(&cpus_chosen) == 0) {
         preempt_enable();
-        put_online_cpus();
+        cpus_read_unlock();
         pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
         return;
     }
@@ -337,7 +337,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
         cs_nsec_min = cs_nsec;
     }
     preempt_enable();
-    put_online_cpus();
+    cpus_read_unlock();
     if (!cpumask_empty(&cpus_ahead))
         pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
             cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
--
cgit v1.2.3
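For reference, a minimal sketch of the replacement API in use. This is
illustrative only, not part of the patch; the helper name is made up:

    #include <linux/cpu.h>
    #include <linux/cpumask.h>
    #include <linux/printk.h>

    /* Walk the online CPUs while holding the hotplug read lock so that
     * no CPU can come or go during the walk. */
    static void walk_online_cpus(void)
    {
        unsigned int cpu;

        cpus_read_lock();               /* was: get_online_cpus() */
        for_each_online_cpu(cpu)
            pr_info("CPU %u is online\n", cpu);
        cpus_read_unlock();             /* was: put_online_cpus() */
    }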
From 1dae37c7e41d9a75a615ba7b0480acc2e04094d4 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 21 Jul 2021 13:01:47 +0100
Subject: posix-timers: Remove redundant initialization of variable ret

The variable ret is initialized with a value that is never read; it is
updated later on. The assignment is redundant and can be removed.

Addresses-Coverity: ("Unused value")
Signed-off-by: Colin Ian King
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210721120147.109570-1-colin.king@canonical.com
---
 kernel/time/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index dd5697d7347b..3913222e7bcf 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info)
 int posix_timer_event(struct k_itimer *timr, int si_private)
 {
     enum pid_type type;
-    int ret = -1;
+    int ret;
     /*
      * FIXME: if ->sigq is queued we can race with
      * dequeue_signal()->posixtimer_rearm().
--
cgit v1.2.3

From a5dec9f82ab2ae486119f0b0820ea16db3e522c3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:08 +0200
Subject: posix-cpu-timers: Assert task sighand is locked while starting cputime counter

Starting the process wide cputime counter needs to be done in the same
sighand locking sequence as actually arming the related timer, otherwise
this races against concurrent timers setting/expiring in the same
threadgroup.

Detecting that the cputime counter is started without holding the sighand
lock is a first step toward debugging such situations.

Suggested-by: Peter Zijlstra (Intel)
Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-2-frederic@kernel.org
---
 include/linux/sched/signal.h   |  6 ++++++
 kernel/signal.c                | 15 +++++++++++++++
 kernel/time/posix-cpu-timers.c |  2 ++
 3 files changed, 23 insertions(+)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index b9126fe06c3f..0310a5add9ab 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -714,6 +714,12 @@ static inline void unlock_task_sighand(struct task_struct *task,
     spin_unlock_irqrestore(&task->sighand->siglock, *flags);
 }

+#ifdef CONFIG_LOCKDEP
+extern void lockdep_assert_task_sighand_held(struct task_struct *task);
+#else
+static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
+#endif
+
 static inline unsigned long task_rlimit(const struct task_struct *task,
         unsigned int limit)
 {
diff --git a/kernel/signal.c b/kernel/signal.c
index a3229add4455..52b6abec0ff8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1413,6 +1413,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
     return sighand;
 }

+#ifdef CONFIG_LOCKDEP
+void lockdep_assert_task_sighand_held(struct task_struct *task)
+{
+    struct sighand_struct *sighand;
+
+    rcu_read_lock();
+    sighand = rcu_dereference(task->sighand);
+    if (sighand)
+        lockdep_assert_held(&sighand->siglock);
+    else
+        WARN_ON_ONCE(1);
+    rcu_read_unlock();
+}
+#endif
+
 /*
  * send signal info to all the members of a group
  */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 517be7fd175e..4693d3c71e7e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
     struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
     struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

+    lockdep_assert_task_sighand_held(tsk);
+
     /* Check if cputimer isn't running. This is accessed without locking. */
     if (!READ_ONCE(pct->timers_active)) {
         struct task_cputime sum;
--
cgit v1.2.3
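A sketch of the locking sequence the new assertion enforces. The caller
below is hypothetical (thread_group_start_cputime() is static within
posix-cpu-timers.c), it only illustrates the pattern:

    #include <linux/sched/signal.h>

    /* Hypothetical caller: wrap the counter start in the sighand lock so
     * that lockdep_assert_task_sighand_held() is satisfied. */
    static void start_counter_locked(struct task_struct *tsk, u64 *samples)
    {
        unsigned long flags;

        if (!lock_task_sighand(tsk, &flags))
            return;     /* task is exiting, no sighand anymore */

        /* sighand->siglock is held here, as the assertion demands */
        thread_group_start_cputime(tsk, samples);

        unlock_task_sighand(tsk, &flags);
    }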
From 175cc3ab28e3509ddee8de4f164b563d99daa570 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:09 +0200
Subject: posix-cpu-timers: Force next_expiration recalc after timer deletion

A timer deletion only dequeues the timer but it doesn't shut down the
related costly process wide cputimer counter and the tick dependency.

The following code snippet keeps this overhead around for one week after
the timer deletion:

    void trigger_process_counter(void)
    {
        timer_t id;
        struct itimerspec val = { };

        val.it_value.tv_sec = 604800;
        timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
        timer_settime(id, 0, &val, NULL);
        timer_delete(id);
    }

Make sure the next target's tick recalculates the nearest expiration and
clears the process wide counter and tick dependency if necessary.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-3-frederic@kernel.org
---
 include/linux/posix-timers.h   |  4 +++-
 kernel/time/posix-cpu-timers.c | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 896c16d2c5fb..4cf1fbe8d1bc 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -82,12 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
     return timerqueue_add(head, &ctmr->node);
 }

-static inline void cpu_timer_dequeue(struct cpu_timer *ctmr)
+static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr)
 {
     if (ctmr->head) {
         timerqueue_del(ctmr->head, &ctmr->node);
         ctmr->head = NULL;
+        return true;
     }
+    return false;
 }

 static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4693d3c71e7e..61c78b62fe6a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -407,6 +407,37 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
     return 0;
 }

+/*
+ * Dequeue the timer and reset the base if it was its earliest expiration.
+ * It makes sure the next tick recalculates the base next expiration so we
+ * don't keep the costly process wide cputime counter around for a random
+ * amount of time, along with the tick dependency.
+ *
+ * If another timer gets queued between this and the next tick, its
+ * expiration will update the base next event if necessary on the next
+ * tick.
+ */
+static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
+{
+    struct cpu_timer *ctmr = &timer->it.cpu;
+    struct posix_cputimer_base *base;
+    int clkidx;
+
+    if (!cpu_timer_dequeue(ctmr))
+        return;
+
+    clkidx = CPUCLOCK_WHICH(timer->it_clock);
+
+    if (CPUCLOCK_PERTHREAD(timer->it_clock))
+        base = p->posix_cputimers.bases + clkidx;
+    else
+        base = p->signal->posix_cputimers.bases + clkidx;
+
+    if (cpu_timer_getexpires(ctmr) == base->nextevt)
+        base->nextevt = 0;
+}
+
+
 /*
  * Clean up a CPU-clock timer that is about to be destroyed.
  * This is called from timer deletion with the timer already locked.
@@ -441,7 +472,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
         if (timer->it.cpu.firing)
             ret = TIMER_RETRY;
         else
-            cpu_timer_dequeue(ctmr);
+            disarm_timer(timer, p);

         unlock_task_sighand(p, &flags);
     }
--
cgit v1.2.3
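The snippet from the changelog turns into a standalone reproducer roughly
as follows (my completion, not part of the patch; link with -lrt on glibc):

    /* Hypothetical standalone reproducer for the leftover tick dependency.
     * Build with: cc repro.c -o repro -lrt */
    #include <signal.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
        timer_t id;
        struct itimerspec val = { 0 };

        val.it_value.tv_sec = 604800;   /* one week */

        /* NULL sigevent: SIGALRM is delivered to the process by default */
        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id))
            return 1;
        timer_settime(id, 0, &val, NULL);
        timer_delete(id);

        /* Before the fix, the process wide cputime counter kept running here */
        for (;;)
            pause();
    }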
From 406dd42bd1ba0c01babf9cde169bb319e52f6147 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:10 +0200
Subject: posix-cpu-timers: Force next expiration recalc after itimer reset

When an itimer deactivates a previously armed expiration, it simply
doesn't do anything. As a result the process wide cputime counter keeps
running and the tick dependency stays set until it reaches the old ghost
expiration value.

This can be reproduced with the following snippet:

    void trigger_process_counter(void)
    {
        struct itimerval n = {};

        n.it_value.tv_sec = 100;
        setitimer(ITIMER_VIRTUAL, &n, NULL);
        n.it_value.tv_sec = 0;
        setitimer(ITIMER_VIRTUAL, &n, NULL);
    }

Fix this by resetting the relevant base expiration. This is similar to
disarming a timer.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-4-frederic@kernel.org
---
 kernel/time/posix-cpu-timers.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 61c78b62fe6a..5c71322b45c7 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1379,8 +1379,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
         }
     }

-    if (!*newval)
-        return;
     *newval += now;
 }
--
cgit v1.2.3
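A self-contained version of the arm/disarm reproducer, assuming that
burning a little CPU time afterwards is enough to exercise the counter
(illustrative only):

    #include <sys/time.h>

    int main(void)
    {
        struct itimerval n = { 0 };

        n.it_value.tv_sec = 100;
        setitimer(ITIMER_VIRTUAL, &n, NULL);    /* arm: counter starts */

        n.it_value.tv_sec = 0;
        setitimer(ITIMER_VIRTUAL, &n, NULL);    /* disarm: counter should stop */

        /* ITIMER_VIRTUAL only ticks while the process runs, so spin a bit */
        for (volatile unsigned long i = 0; i < 100000000UL; i++)
            ;
        return 0;
    }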
From d9c1b2a1089f606404284b9f5b045a584d73382d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:11 +0200
Subject: posix-cpu-timers: Remove confusing return value override

The end of the function cannot be reached with an error in variable ret.
Unconfuse reviewers about that.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-5-frederic@kernel.org
---
 kernel/time/posix-cpu-timers.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5c71322b45c7..4fbbbc89ac4a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -744,8 +744,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
          */
         cpu_timer_fire(timer);
     }
-
-    ret = 0;
 out:
     rcu_read_unlock();
     if (old)
--
cgit v1.2.3

From 5c8f23e6b73c13d9f7b52614783dcb9169883296 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:12 +0200
Subject: posix-cpu-timers: Consolidate timer base accessor

Remove the ad-hoc timer base accessors and provide a consolidated one.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-6-frederic@kernel.org
---
 kernel/time/posix-cpu-timers.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4fbbbc89ac4a..0d918117a3e0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -407,6 +407,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
     return 0;
 }

+static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
+                                              struct task_struct *tsk)
+{
+    int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+
+    if (CPUCLOCK_PERTHREAD(timer->it_clock))
+        return tsk->posix_cputimers.bases + clkidx;
+    else
+        return tsk->signal->posix_cputimers.bases + clkidx;
+}
+
 /*
  * Dequeue the timer and reset the base if it was its earliest expiration.
  * It makes sure the next tick recalculates the base next expiration so we
@@ -421,18 +432,11 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
 {
     struct cpu_timer *ctmr = &timer->it.cpu;
     struct posix_cputimer_base *base;
-    int clkidx;

     if (!cpu_timer_dequeue(ctmr))
         return;

-    clkidx = CPUCLOCK_WHICH(timer->it_clock);
-
-    if (CPUCLOCK_PERTHREAD(timer->it_clock))
-        base = p->posix_cputimers.bases + clkidx;
-    else
-        base = p->signal->posix_cputimers.bases + clkidx;
-
+    base = timer_base(timer, p);
     if (cpu_timer_getexpires(ctmr) == base->nextevt)
         base->nextevt = 0;
 }
@@ -531,15 +535,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
  */
 static void arm_timer(struct k_itimer *timer, struct task_struct *p)
 {
-    int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+    struct posix_cputimer_base *base = timer_base(timer, p);
     struct cpu_timer *ctmr = &timer->it.cpu;
     u64 newexp = cpu_timer_getexpires(ctmr);
-    struct posix_cputimer_base *base;
-
-    if (CPUCLOCK_PERTHREAD(timer->it_clock))
-        base = p->posix_cputimers.bases + clkidx;
-    else
-        base = p->signal->posix_cputimers.bases + clkidx;

     if (!cpu_timer_enqueue(&base->tqhead, ctmr))
         return;
--
cgit v1.2.3

From ee375328f579f94251eb66d5dc91aba056019a31 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Jul 2021 14:55:13 +0200
Subject: posix-cpu-timers: Recalc next expiration when timer_settime() ends up not queueing

There are several scenarios that can result in posix_cpu_timer_set()
not queueing the timer but still leaving the threadgroup cputime counter
running or keeping the tick dependency around for a random amount of
time.

1) If timer_settime() is called with a 0 expiration on a timer that is
   already disabled, the process wide cputime counter will be started
   and won't ever get a chance to be stopped by stop_process_timers()
   since no timer is actually armed to be processed.

   The following snippet is enough to trigger the issue.

    void trigger_process_counter(void)
    {
        timer_t id;
        struct itimerspec val = { };

        timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
        timer_settime(id, TIMER_ABSTIME, &val, NULL);
        timer_delete(id);
    }

2) If timer_settime() is called with a 0 expiration on a timer that is
   already armed, the timer is dequeued but not really disarmed. So the
   process wide cputime counter and the tick dependency may still remain
   around for a while.

   The following code snippet keeps this overhead around for one week
   after the timer deletion:

    void trigger_process_counter(void)
    {
        timer_t id;
        struct itimerspec val = { };

        val.it_value.tv_sec = 604800;
        timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
        timer_settime(id, 0, &val, NULL);
        timer_delete(id);
    }

3) If the timer was initially deactivated, this call to timer_settime()
   with an early expiration may have started the process wide cputime
   counter even though the timer hasn't been queued and armed because it
   has fired early and inline within posix_cpu_timer_set() itself. As a
   result the process wide cputime counter may never stop until a new
   timer is ever armed in the future.

   The following code snippet can reproduce this:

    void trigger_process_counter(void)
    {
        timer_t id;
        struct itimerspec val = { };

        signal(SIGALRM, SIG_IGN);
        timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
        val.it_value.tv_nsec = 1;
        timer_settime(id, TIMER_ABSTIME, &val, NULL);
    }

4) If the timer was initially armed with a former expiration value
   before this call to timer_settime() and the current call sets an
   early deadline that has already expired, the timer fires inline
   within posix_cpu_timer_set(). In this case it must have been dequeued
   before firing inline with its new expiration value, yet it hasn't
   been disarmed. So the process wide cputime counter and the tick
   dependency may still be around for a while even after the timer
   fired.

   The following code snippet can reproduce this:

    void trigger_process_counter(void)
    {
        timer_t id;
        struct itimerspec val = { };

        signal(SIGALRM, SIG_IGN);
        timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
        val.it_value.tv_sec = 100;
        timer_settime(id, TIMER_ABSTIME, &val, NULL);
        val.it_value.tv_sec = 0;
        val.it_value.tv_nsec = 1;
        timer_settime(id, TIMER_ABSTIME, &val, NULL);
    }

Fix all these issues by triggering the related base next expiration
recalculation on the next tick. This also implies re-evaluating the need
to keep around the process wide cputime counter and the tick dependency,
in a similar fashion to disarm_timer().

Suggested-by: Peter Zijlstra (Intel)
Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20210726125513.271824-7-frederic@kernel.org
---
 include/linux/posix-timers.h   |  7 ++++++-
 kernel/time/posix-cpu-timers.c | 41 +++++++++++++++++++++++++++++++++++------
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 4cf1fbe8d1bc..00fef0064355 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -82,9 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
     return timerqueue_add(head, &ctmr->node);
 }

+static inline bool cpu_timer_queued(struct cpu_timer *ctmr)
+{
+    return !!ctmr->head;
+}
+
 static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr)
 {
-    if (ctmr->head) {
+    if (cpu_timer_queued(ctmr)) {
         timerqueue_del(ctmr->head, &ctmr->node);
         ctmr->head = NULL;
         return true;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0d918117a3e0..ee736861b18f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -418,6 +418,20 @@ static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
     return tsk->signal->posix_cputimers.bases + clkidx;
 }

+/*
+ * Force recalculating the base earliest expiration on the next tick.
+ * This will also re-evaluate the need to keep around the process wide
+ * cputime counter and tick dependency and eventually shut these down
+ * if necessary.
+ */
+static void trigger_base_recalc_expires(struct k_itimer *timer,
+                                        struct task_struct *tsk)
+{
+    struct posix_cputimer_base *base = timer_base(timer, tsk);
+
+    base->nextevt = 0;
+}
+
 /*
  * Dequeue the timer and reset the base if it was its earliest expiration.
  * It makes sure the next tick recalculates the base next expiration so we
@@ -438,7 +452,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p)

     base = timer_base(timer, p);
     if (cpu_timer_getexpires(ctmr) == base->nextevt)
-        base->nextevt = 0;
+        trigger_base_recalc_expires(timer, p);
 }

@@ -734,13 +748,28 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
     timer->it_overrun_last = 0;
     timer->it_overrun = -1;

-    if (new_expires != 0 && !(val < new_expires)) {
+    if (val >= new_expires) {
+        if (new_expires != 0) {
+            /*
+             * The designated time already passed, so we notify
+             * immediately, even if the thread never runs to
+             * accumulate more time on this clock.
+             */
+            cpu_timer_fire(timer);
+        }
+
         /*
-         * The designated time already passed, so we notify
-         * immediately, even if the thread never runs to
-         * accumulate more time on this clock.
+         * Make sure we don't keep around the process wide cputime
+         * counter or the tick dependency if they are not necessary.
          */
-        cpu_timer_fire(timer);
+        sighand = lock_task_sighand(p, &flags);
+        if (!sighand)
+            goto out;
+
+        if (!cpu_timer_queued(ctmr))
+            trigger_base_recalc_expires(timer, p);
+
+        unlock_task_sighand(p, &flags);
     }
 out:
     rcu_read_unlock();
--
cgit v1.2.3

From 627ef5ae2df8eeccb20d5af0e4cfa4df9e61ed28 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:46 +0200
Subject: hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns()

If __hrtimer_start_range_ns() is invoked with an already armed hrtimer
then the timer has to be canceled first and then added back. If the timer
is the first expiring timer then on removal the clockevent device is
reprogrammed to the next expiring timer to avoid that the pending expiry
fires needlessly.

If the new expiry time ends up being the first expiry again then the
clock event device has to be reprogrammed again.

Avoid this by checking whether the timer is the first to expire and in
that case, keep the timer on the current CPU and delay the reprogramming
up to the point where the timer has been enqueued again.

Reported-by: Lorenzo Colitti
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135157.873137732@linutronix.de
---
 kernel/time/hrtimer.c | 60 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4a66725b1d4a..ba2e0d0a0e5a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1030,12 +1030,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+               bool restart, bool keep_local)
 {
     u8 state = timer->state;

     if (state & HRTIMER_STATE_ENQUEUED) {
-        int reprogram;
+        bool reprogram;

         /*
          * Remove the timer and force reprogramming when high
@@ -1048,8 +1049,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
         debug_deactivate(timer);
         reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

+        /*
+         * If the timer is not restarted then reprogramming is
+         * required if the timer is local. If it is local and about
+         * to be restarted, avoid programming it twice (on removal
+         * and a moment later when it's requeued).
+         */
         if (!restart)
             state = HRTIMER_STATE_INACTIVE;
+        else
+            reprogram &= !keep_local;

         __remove_hrtimer(timer, base, state, reprogram);
         return 1;
@@ -1103,9 +1112,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                     struct hrtimer_clock_base *base)
 {
     struct hrtimer_clock_base *new_base;
+    bool force_local, first;

-    /* Remove an active timer from the queue: */
-    remove_hrtimer(timer, base, true);
+    /*
+     * If the timer is on the local cpu base and is the first expiring
+     * timer then this might end up reprogramming the hardware twice
+     * (on removal and on enqueue). To avoid that by prevent the
+     * reprogram on removal, keep the timer local to the current CPU
+     * and enforce reprogramming after it is queued no matter whether
+     * it is the new first expiring timer again or not.
+     */
+    force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+    force_local &= base->cpu_base->next_timer == timer;
+
+    /*
+     * Remove an active timer from the queue. In case it is not queued
+     * on the current CPU, make sure that remove_hrtimer() updates the
+     * remote data correctly.
+     *
+     * If it's on the current CPU and the first expiring timer, then
+     * skip reprogramming, keep the timer local and enforce
+     * reprogramming later if it was the first expiring timer. This
+     * avoids programming the underlying clock event twice (once at
+     * removal and once after enqueue).
+     */
+    remove_hrtimer(timer, base, true, force_local);

     if (mode & HRTIMER_MODE_REL)
         tim = ktime_add_safe(tim, base->get_time());
@@ -1115,9 +1146,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
     hrtimer_set_expires_range_ns(timer, tim, delta_ns);

     /* Switch the timer base, if necessary: */
-    new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+    if (!force_local) {
+        new_base = switch_hrtimer_base(timer, base,
+                                       mode & HRTIMER_MODE_PINNED);
+    } else {
+        new_base = base;
+    }
+
+    first = enqueue_hrtimer(timer, new_base, mode);
+    if (!force_local)
+        return first;

-    return enqueue_hrtimer(timer, new_base, mode);
+    /*
+     * Timer was forced to stay on the current CPU to avoid
+     * reprogramming on removal and enqueue. Force reprogram the
+     * hardware by evaluating the new first expiring timer.
+     */
+    hrtimer_force_reprogram(new_base->cpu_base, 1);
+    return 0;
 }

 /**
@@ -1183,7 +1229,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
     base = lock_hrtimer_base(timer, &flags);

     if (!hrtimer_callback_running(timer))
-        ret = remove_hrtimer(timer, base, false);
+        ret = remove_hrtimer(timer, base, false, false);

     unlock_hrtimer_base(timer, &flags);
--
cgit v1.2.3
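The optimized removal/requeue path matters for timers that are repeatedly
pushed further out while armed, e.g. an idle timeout refreshed on every
packet. A hedged kernel-side sketch of that usage pattern (all names are
invented for illustration):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer idle_timeout;

    static enum hrtimer_restart idle_expired(struct hrtimer *t)
    {
        /* connection idled out */
        return HRTIMER_NORESTART;
    }

    static void watchdog_init(void)
    {
        hrtimer_init(&idle_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        idle_timeout.function = idle_expired;
    }

    /* Called on every packet. Restarting an already queued hrtimer takes
     * the remove + re-enqueue path that the patch above keeps from
     * reprogramming the clock event device twice. */
    static void packet_seen(void)
    {
        hrtimer_start(&idle_timeout, ms_to_ktime(5000), HRTIMER_MODE_REL);
    }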
From b14bca97c9f5c3e3f133445b01c723e95490d843 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 13 Jul 2021 15:39:47 +0200
Subject: hrtimer: Consolidate reprogramming code

This code is mostly duplicated. The redundant store in the force
reprogram case does no harm and the in-hrtimer-interrupt condition
cannot be true for the force reprogram invocations.

Signed-off-by: Peter Zijlstra
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.054424875@linutronix.de
---
 kernel/time/hrtimer.c | 72 +++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 43 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ba2e0d0a0e5a..5f7c46598d9f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -652,21 +652,24 @@ static inline int hrtimer_hres_active(void)
     return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 }

-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
 static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal,
+                    struct hrtimer *next_timer, ktime_t expires_next)
 {
-    ktime_t expires_next;
+    /*
+     * If the hrtimer interrupt is running, then it will reevaluate the
+     * clock bases and reprogram the clock event device.
+     */
+    if (cpu_base->in_hrtirq)
+        return;

-    expires_next = hrtimer_update_next_event(cpu_base);
+    if (expires_next > cpu_base->expires_next)
+        return;

     if (skip_equal && expires_next == cpu_base->expires_next)
         return;

+    cpu_base->next_timer = next_timer;
     cpu_base->expires_next = expires_next;

     /*
@@ -689,7 +692,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
     if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
         return;

-    tick_program_event(cpu_base->expires_next, 1);
+    tick_program_event(expires_next, 1);
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+{
+    ktime_t expires_next;
+
+    expires_next = hrtimer_update_next_event(cpu_base);
+
+    __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer,
+                        expires_next);
 }

 /* High resolution timer related functions */
@@ -835,40 +854,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
     if (base->cpu_base != cpu_base)
         return;

-    /*
-     * If the hrtimer interrupt is running, then it will
-     * reevaluate the clock bases and reprogram the clock event
-     * device. The callbacks are always executed in hard interrupt
-     * context so we don't need an extra check for a running
-     * callback.
-     */
-    if (cpu_base->in_hrtirq)
-        return;
-
-    if (expires >= cpu_base->expires_next)
-        return;
-
-    /* Update the pointer to the next expiring timer */
-    cpu_base->next_timer = timer;
-    cpu_base->expires_next = expires;
-
-    /*
-     * If hres is not active, hardware does not have to be
-     * programmed yet.
-     *
-     * If a hang was detected in the last timer interrupt then we
-     * do not schedule a timer which is earlier than the expiry
-     * which we enforced in the hang detection. We want the system
-     * to make progress.
-     */
-    if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
-        return;
-
-    /*
-     * Program the timer hardware. We enforce the expiry for
-     * events which are already in the past.
-     */
-    tick_program_event(expires, 1);
+    __hrtimer_reprogram(cpu_base, true, timer, expires);
 }

 /*
--
cgit v1.2.3

From 8c3b5e6ec0fee18bc2ce38d1dfe913413205f908 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:48 +0200
Subject: hrtimer: Ensure timerfd notification for HIGHRES=n

If high resolution timers are disabled, the timerfd notification about a
clock-was-set event does not happen for all cases which use
clock_was_set_delayed(), because that function is a NOP for HIGHRES=n,
which is wrong.

Make clock_was_set_delayed() unconditionally available to fix that.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.196661266@linutronix.de
---
 include/linux/hrtimer.h    |  5 -----
 kernel/time/hrtimer.c       | 32 ++++++++++++++++----------------
 kernel/time/tick-internal.h |  3 +++
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a4274..77295af72426 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -318,16 +318,12 @@ struct clock_event_device;

 extern void hrtimer_interrupt(struct clock_event_device *dev);

-extern void clock_was_set_delayed(void);
-
 extern unsigned int hrtimer_resolution;

 #else

 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC

-static inline void clock_was_set_delayed(void) { }
-
 #endif

 static inline ktime_t
@@ -351,7 +347,6 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
                                      timer->base->get_time());
 }

-extern void clock_was_set(void);
 #ifdef CONFIG_TIMERFD
 extern void timerfd_clock_was_set(void);
 #else
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5f7c46598d9f..7ebf642b53f9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -777,22 +777,6 @@ static void hrtimer_switch_to_hres(void)
     retrigger_next_event(NULL);
 }

-static void clock_was_set_work(struct work_struct *work)
-{
-    clock_was_set();
-}
-
-static DECLARE_WORK(hrtimer_work, clock_was_set_work);
-
-/*
- * Called from timekeeping and resume code to reprogram the hrtimer
- * interrupt device on all cpus.
- */
-void clock_was_set_delayed(void)
-{
-    schedule_work(&hrtimer_work);
-}
-
 #else

 static inline int hrtimer_is_hres_enabled(void) { return 0; }
@@ -877,6 +861,22 @@ void clock_was_set(void)
     timerfd_clock_was_set();
 }

+static void clock_was_set_work(struct work_struct *work)
+{
+    clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping and resume code to reprogram the hrtimer
+ * interrupt device on all cpus and to notify timerfd.
+ */
+void clock_was_set_delayed(void)
+{
+    schedule_work(&hrtimer_work);
+}
+
 /*
  * During resume we might have to reprogram the high resolution timer
  * interrupt on all online CPUs. However, all other CPUs will be
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 6a742a29e545..cd610faa2523 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -165,3 +165,6 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
 void timer_clear_idle(void);
+
+void clock_was_set(void);
+void clock_was_set_delayed(void);
--
cgit v1.2.3
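timerfd is the visible consumer of this notification: a CLOCK_REALTIME
timerfd armed with TFD_TIMER_CANCEL_ON_SET reports a clock-was-set event
as ECANCELED on read(). An illustrative userspace observer (run
`date -s ...` from another shell to trigger it):

    #include <sys/timerfd.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
        struct itimerspec its = { 0 };
        uint64_t ticks;
        int fd = timerfd_create(CLOCK_REALTIME, 0);

        clock_gettime(CLOCK_REALTIME, &its.it_value);
        its.it_value.tv_sec += 3600;    /* far enough in the future */

        timerfd_settime(fd, TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET,
                        &its, NULL);

        /* read() fails with ECANCELED when CLOCK_REALTIME is set */
        if (read(fd, &ticks, sizeof(ticks)) < 0 && errno == ECANCELED)
            printf("CLOCK_REALTIME was set\n");
        return 0;
    }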
From e71a4153b7c256ec103e79875398553808aeffd2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:49 +0200
Subject: hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case

When CONFIG_HIGH_RES_TIMERS is disabled but NOHZ is enabled, then
clock_was_set() is not doing anything. With HIGHRES=n the kernel relies on
the periodic tick to update the clock offsets, but when NOHZ is enabled
and active then CPUs which are in a deep idle sleep do not have a periodic
tick, which means the expiry of timers affected by clock_was_set() can be
arbitrarily delayed up to the point where the CPUs are brought out of idle
again.

Make the clock_was_set() logic unconditionally available so that idle CPUs
are kicked out of idle to handle the update.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.288697903@linutronix.de
---
 kernel/time/hrtimer.c | 87 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 28 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 7ebf642b53f9..214fd65a9597 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -739,23 +739,7 @@ static inline int hrtimer_is_hres_enabled(void)
     return hrtimer_hres_enabled;
 }

-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
-    struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
-
-    if (!__hrtimer_hres_active(base))
-        return;
-
-    raw_spin_lock(&base->lock);
-    hrtimer_update_base(base);
-    hrtimer_force_reprogram(base, 0);
-    raw_spin_unlock(&base->lock);
-}
+static void retrigger_next_event(void *arg);

 /*
  * Switch to high resolution mode
@@ -781,9 +765,50 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }

 static inline void hrtimer_switch_to_hres(void) { }

-static inline void retrigger_next_event(void *arg) { }

 #endif /* CONFIG_HIGH_RES_TIMERS */

+/*
+ * Retrigger next event is called after clock was set with interrupts
+ * disabled through an SMP function call or directly from low level
+ * resume code.
+ *
+ * This is only invoked when:
+ *  - CONFIG_HIGH_RES_TIMERS is enabled.
+ *  - CONFIG_NOHZ_COMMON is enabled
+ *
+ * For the other cases this function is empty and because the call sites
+ * are optimized out it vanishes as well, i.e. no need for lots of
+ * #ifdeffery.
+ */
+static void retrigger_next_event(void *arg)
+{
+    struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
+
+    /*
+     * When high resolution mode or nohz is active, then the offsets of
+     * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
+     * next tick will take care of that.
+     *
+     * If high resolution mode is active then the next expiring timer
+     * must be reevaluated and the clock event device reprogrammed if
+     * necessary.
+     *
+     * In the NOHZ case the update of the offset and the reevaluation
+     * of the next expiring timer is enough. The return from the SMP
+     * function call will take care of the reprogramming in case the
+     * CPU was in a NOHZ idle sleep.
+     */
+    if (!__hrtimer_hres_active(base) && !tick_nohz_active)
+        return;
+
+    raw_spin_lock(&base->lock);
+    hrtimer_update_base(base);
+    if (__hrtimer_hres_active(base))
+        hrtimer_force_reprogram(base, 0);
+    else
+        hrtimer_update_next_event(base);
+    raw_spin_unlock(&base->lock);
+}

 /*
  * When a timer is enqueued and expires earlier than the already enqueued
@@ -842,22 +867,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 }

 /*
- * Clock realtime was set
- *
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
+ * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
+ * CLOCK_BOOTTIME (for late sleep time injection).
  *
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
+ * This requires to update the offsets for these clocks
+ * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
+ * also requires to eventually reprogram the per CPU clock event devices
+ * when the change moves an affected timer ahead of the first expiring
+ * timer on that CPU. Obviously remote per CPU clock event devices cannot
+ * be reprogrammed. The other reason why an IPI has to be sent is when the
+ * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
+ * in the tick, which obviously might be stopped, so this has to bring out
+ * the remote CPU which might sleep in idle to get this sorted.
  */
 void clock_was_set(void)
 {
-#ifdef CONFIG_HIGH_RES_TIMERS
+    if (!hrtimer_hres_active() && !tick_nohz_active)
+        goto out_timerfd;
+
     /* Retrigger the CPU local events everywhere */
     on_each_cpu(retrigger_next_event, NULL, 1);
-#endif
+
+out_timerfd:
     timerfd_clock_was_set();
 }
--
cgit v1.2.3

From 66f7b0c8aadd2785fc29f2c71477ebc16f4e38cc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:50 +0200
Subject: timerfd: Provide timerfd_resume()

Resuming timekeeping is a clock-was-set event and uses the clock-was-set
notification mechanism. This is in the way of making the clock-was-set
update for hrtimers selective so unnecessary IPIs are avoided when a CPU
base does not have timers queued which are affected by the clock setting.

Provide a separate timerfd_resume() interface so the resume logic and the
clock-was-set mechanism can be disentangled in the core code.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.395287410@linutronix.de
---
 fs/timerfd.c            | 16 ++++++++++++++++
 include/linux/hrtimer.h |  2 ++
 2 files changed, 18 insertions(+)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index c5509d2448e3..e9c96a0c79f1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -115,6 +115,22 @@ void timerfd_clock_was_set(void)
     rcu_read_unlock();
 }

+static void timerfd_resume_work(struct work_struct *work)
+{
+    timerfd_clock_was_set();
+}
+
+static DECLARE_WORK(timerfd_work, timerfd_resume_work);
+
+/*
+ * Invoked from timekeeping_resume(). Defer the actual update to work so
+ * timerfd_clock_was_set() runs in task context.
+ */
+void timerfd_resume(void)
+{
+    schedule_work(&timerfd_work);
+}
+
 static void __timerfd_remove_cancel(struct timerfd_ctx *ctx)
 {
     if (ctx->might_cancel) {
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 77295af72426..253c6e25f331 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -349,8 +349,10 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)

 #ifdef CONFIG_TIMERFD
 extern void timerfd_clock_was_set(void);
+extern void timerfd_resume(void);
 #else
 static inline void timerfd_clock_was_set(void) { }
+static inline void timerfd_resume(void) { }
 #endif

 extern void hrtimers_resume(void);
--
cgit v1.2.3
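The same defer-to-task-context idiom appears in several of these patches;
a generic sketch with hypothetical names, assuming the deferred update
needs to take sleeping locks:

    #include <linux/workqueue.h>

    /* The actual update runs from a workqueue worker, where sleeping
     * locks may be taken safely. */
    static void my_update_work(struct work_struct *work)
    {
        /* ... do the lock-taking update here ... */
    }

    static DECLARE_WORK(my_work, my_update_work);

    /* Safe to call from atomic or early-resume context: only queues work. */
    static void my_event_from_atomic_context(void)
    {
        schedule_work(&my_work);
    }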
From a761a67f591a8c7476c30bb20ed0f09fdfb1a704 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:51 +0200
Subject: timekeeping: Distangle resume and clock-was-set events

Resuming timekeeping is a clock-was-set event and uses the clock-was-set
notification mechanism. This is in the way of making the clock-was-set
update for hrtimers selective so unnecessary IPIs are avoided when a CPU
base does not have timers queued which are affected by the clock setting.

Disentangle it by invoking hrtimer_resume() on each unfreezing CPU and by
invoking the new timerfd_resume() function from timekeeping_resume(),
which is the only place where this is needed.

Rename hrtimer_resume() to hrtimer_resume_local() to reflect the change.

With this the clock_was_set*() functions are no longer required to IPI all
CPUs unconditionally and can get some smarts to avoid them.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.488853478@linutronix.de
---
 include/linux/hrtimer.h    |  1 -
 kernel/time/hrtimer.c       | 15 ++++++---------
 kernel/time/tick-common.c   |  7 +++++++
 kernel/time/tick-internal.h |  2 ++
 kernel/time/timekeeping.c   |  4 +++-
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 253c6e25f331..0ee140176f10 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -354,7 +354,6 @@ extern void timerfd_resume(void);
 static inline void timerfd_clock_was_set(void) { }
 static inline void timerfd_resume(void) { }
 #endif
-extern void hrtimers_resume(void);

 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 214fd65a9597..68e56f0ecb09 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -900,8 +900,8 @@ static void clock_was_set_work(struct work_struct *work)
 static DECLARE_WORK(hrtimer_work, clock_was_set_work);

 /*
- * Called from timekeeping and resume code to reprogram the hrtimer
- * interrupt device on all cpus and to notify timerfd.
+ * Called from timekeeping code to reprogram the hrtimer interrupt device
+ * on all cpus and to notify timerfd.
  */
 void clock_was_set_delayed(void)
 {
@@ -909,18 +909,15 @@ void clock_was_set_delayed(void)
 }

 /*
- * During resume we might have to reprogram the high resolution timer
- * interrupt on all online CPUs. However, all other CPUs will be
- * stopped with IRQs interrupts disabled so the clock_was_set() call
- * must be deferred.
+ * Called during resume either directly from via timekeeping_resume()
+ * or in the case of s2idle from tick_unfreeze() to ensure that the
+ * hrtimers are up to date.
  */
-void hrtimers_resume(void)
+void hrtimers_resume_local(void)
 {
     lockdep_assert_irqs_disabled();
     /* Retrigger on the local CPU */
     retrigger_next_event(NULL);
-    /* And schedule a retrigger for all others */
-    clock_was_set_delayed();
 }

 /*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d663249652ef..46789356f856 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -470,6 +470,13 @@ void tick_resume_local(void)
         else
             tick_resume_oneshot();
     }
+
+    /*
+     * Ensure that hrtimers are up to date and the clockevents device
+     * is reprogrammed correctly when high resolution timers are
+     * enabled.
+     */
+    hrtimers_resume_local();
 }

 /**
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cd610faa2523..22de98cc6dd8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -168,3 +168,5 @@ void timer_clear_idle(void);

 void clock_was_set(void);
 void clock_was_set_delayed(void);
+
+void hrtimers_resume_local(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8a364aa9881a..c8a9b9e54c9d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1810,8 +1810,10 @@ void timekeeping_resume(void)

     touch_softlockup_watchdog();

+    /* Resume the clockevent device(s) and hrtimers */
     tick_resume();
-    hrtimers_resume();
+    /* Notify timerfd as resume is equivalent to clock_was_set() */
+    timerfd_resume();
 }

 int timekeeping_suspend(void)
--
cgit v1.2.3

From 1b267793f4fd9a089ea8558f3b6698186b9a3214 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:52 +0200
Subject: time/timekeeping: Avoid invoking clock_was_set() twice

do_adjtimex() might end up scheduling a delayed clock_was_set() via
timekeeping_advance() and then invoke clock_was_set() directly, which is
pointless.

Make timekeeping_advance() return whether an invocation of
clock_was_set() is required and handle it at the call sites, which allows
do_adjtimex() to issue a single direct call if required.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.580966888@linutronix.de
---
 kernel/time/timekeeping.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c8a9b9e54c9d..19ed58e97b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2127,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
  * timekeeping_advance - Updates the timekeeper to the current time and
  * current NTP tick length
  */
-static void timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
 {
     struct timekeeper *real_tk = &tk_core.timekeeper;
     struct timekeeper *tk = &shadow_timekeeper;
@@ -2198,9 +2198,8 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
     write_seqcount_end(&tk_core.seq);
 out:
     raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-    if (clock_set)
-        /* Have to call _delayed version, since in irq context*/
-        clock_was_set_delayed();
+
+    return !!clock_set;
 }

 /**
@@ -2209,7 +2208,8 @@ out:
  */
 void update_wall_time(void)
 {
-    timekeeping_advance(TK_ADV_TICK);
+    if (timekeeping_advance(TK_ADV_TICK))
+        clock_was_set_delayed();
 }

 /**
@@ -2389,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc)
 {
     struct timekeeper *tk = &tk_core.timekeeper;
     struct audit_ntp_data ad;
-    unsigned long flags;
+    bool clock_set = false;
     struct timespec64 ts;
+    unsigned long flags;
     s32 orig_tai, tai;
     int ret;
@@ -2425,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc)
     if (tai != orig_tai) {
         __timekeeping_set_tai_offset(tk, tai);
         timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+        clock_set = true;
     }
     tk_update_leap_state(tk);
@@ -2435,9 +2437,9 @@ int do_adjtimex(struct __kernel_timex *txc)

     /* Update the multiplier immediately if frequency was set directly */
     if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
-        timekeeping_advance(TK_ADV_FREQ);
+        clock_set |= timekeeping_advance(TK_ADV_FREQ);

-    if (tai != orig_tai)
+    if (clock_set)
         clock_was_set();

     ntp_notify_cmos_timer();
--
cgit v1.2.3

From 17a1b8826b451c80e7999a7c68e06b70579b2b8f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:53 +0200
Subject: hrtimer: Add bases argument to clock_was_set()

clock_was_set() unconditionally invokes retrigger_next_event() on all
online CPUs. This was necessary because that mechanism was also used for
resume from suspend to idle, which is no longer the case.

The bases argument allows the callers of clock_was_set() to hand in a
mask which tells clock_was_set() which of the hrtimer clock bases are
affected by the clock setting. This mask will be used in the next step to
check whether a CPU base has timers queued on a clock base affected by
the event and avoid the SMP function call if there are none.

Add a @bases argument, provide defines for the active bases masking and
fix up all call sites.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.691083465@linutronix.de
---
 kernel/time/hrtimer.c       |  4 ++--
 kernel/time/tick-internal.h |  9 ++++++++-
 kernel/time/timekeeping.c   | 14 +++++++-------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 68e56f0ecb09..c8af165c04eb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -880,7 +880,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
  * in the tick, which obviously might be stopped, so this has to bring out
  * the remote CPU which might sleep in idle to get this sorted.
  */
-void clock_was_set(void)
+void clock_was_set(unsigned int bases)
 {
     if (!hrtimer_hres_active() && !tick_nohz_active)
         goto out_timerfd;
@@ -894,7 +894,7 @@ out_timerfd:

 static void clock_was_set_work(struct work_struct *work)
 {
-    clock_was_set();
+    clock_was_set(CLOCK_SET_WALL);
 }

 static DECLARE_WORK(hrtimer_work, clock_was_set_work);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 22de98cc6dd8..3548f0829e6d 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -166,7 +166,14 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
 void timer_clear_idle(void);

-void clock_was_set(void);
+#define CLOCK_SET_WALL                                                  \
+    (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) |    \
+     BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT))
+
+#define CLOCK_SET_BOOT                                                  \
+    (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT))
+
+void clock_was_set(unsigned int bases);
 void clock_was_set_delayed(void);

 void hrtimers_resume_local(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 19ed58e97b57..b348749a9fc6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1323,8 +1323,8 @@ out:
     write_seqcount_end(&tk_core.seq);
     raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

-    /* signal hrtimers about time change */
-    clock_was_set();
+    /* Signal hrtimers about time change */
+    clock_was_set(CLOCK_SET_WALL);

     if (!ret)
         audit_tk_injoffset(ts_delta);
@@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */
     write_seqcount_end(&tk_core.seq);
     raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

-    /* signal hrtimers about time change */
-    clock_was_set();
+    /* Signal hrtimers about time change */
+    clock_was_set(CLOCK_SET_WALL);

     return ret;
 }
@@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
     write_seqcount_end(&tk_core.seq);
     raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

-    /* signal hrtimers about time change */
-    clock_was_set();
+    /* Signal hrtimers about time change */
+    clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
 }
 #endif

@@ -2440,7 +2440,7 @@ int do_adjtimex(struct __kernel_timex *txc)
         clock_set |= timekeeping_advance(TK_ADV_FREQ);

     if (clock_set)
-        clock_was_set();
+        clock_was_set(CLOCK_REALTIME);

     ntp_notify_cmos_timer();
--
cgit v1.2.3

From 81d741d3460ca422843ce0ec8351083f259c6166 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Tue, 13 Jul 2021 15:39:54 +0200
Subject: hrtimer: Avoid unnecessary SMP function calls in clock_was_set()

Setting of clocks triggers an unconditional SMP function call on all
online CPUs to reprogram the clock event device.

However, only some clocks have their offsets updated and therefore
potentially require a reprogram. That's CLOCK_REALTIME and CLOCK_TAI and
in the case of resume (delayed sleep time injection) also CLOCK_BOOTTIME.

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the affected clock bases which are
indicated by the caller in the @bases argument of clock_was_set().

If that's not the case, skip the IPI and update the offsets remotely,
which ensures that any subsequently armed timers on the affected clocks
are evaluated with the correct offsets.

[ tglx: Adapted to the new bases argument, removed the softirq_active
        check, added comment, fixed up stale comment ]

Signed-off-by: Marcelo Tosatti
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.787536542@linutronix.de
---
 kernel/time/hrtimer.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c8af165c04eb..5d44c90d41ea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -882,11 +882,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
  */
 void clock_was_set(unsigned int bases)
 {
+    cpumask_var_t mask;
+    int cpu;
+
     if (!hrtimer_hres_active() && !tick_nohz_active)
         goto out_timerfd;

-    /* Retrigger the CPU local events everywhere */
-    on_each_cpu(retrigger_next_event, NULL, 1);
+    if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+        on_each_cpu(retrigger_next_event, NULL, 1);
+        goto out_timerfd;
+    }
+
+    /* Avoid interrupting CPUs if possible */
+    cpus_read_lock();
+    for_each_online_cpu(cpu) {
+        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+        unsigned long flags;
+
+        raw_spin_lock_irqsave(&cpu_base->lock, flags);
+        /*
+         * Only send the IPI when there are timers queued in one of
+         * the affected clock bases. Otherwise update the base
+         * remote to ensure that the next enqueue of a timer on
+         * such a clock base will see the correct offsets.
+         */
+        if (cpu_base->active_bases & bases)
+            cpumask_set_cpu(cpu, mask);
+        else
+            hrtimer_update_base(cpu_base);
+        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+    }
+
+    preempt_disable();
+    smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+    preempt_enable();
+    cpus_read_unlock();
+    free_cpumask_var(mask);

 out_timerfd:
     timerfd_clock_was_set();
--
cgit v1.2.3
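The shape of this optimization is a reusable idiom: collect the CPUs that
actually need the cross-call under their lock, then IPI only those. A
condensed, illustrative sketch (not the kernel code verbatim; the helper
and predicate are invented):

    #include <linux/cpumask.h>
    #include <linux/smp.h>

    /* Hypothetical helper showing the "collect targets, then IPI" idiom. */
    static void kick_cpus_that_need_it(bool (*needs_ipi)(int cpu),
                                       smp_call_func_t func)
    {
        cpumask_var_t mask;
        int cpu;

        if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
            on_each_cpu(func, NULL, 1);     /* fallback: IPI everyone */
            return;
        }

        cpus_read_lock();
        for_each_online_cpu(cpu) {
            if (needs_ipi(cpu))
                cpumask_set_cpu(cpu, mask);
        }

        /* smp_call_function_many() requires preemption to be disabled */
        preempt_disable();
        smp_call_function_many(mask, func, NULL, 1);
        preempt_enable();
        cpus_read_unlock();

        free_cpumask_var(mask);
    }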
From 1e7f7fbcd40c69d23e3fe641ead9f3dc128fa8aa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jul 2021 15:39:55 +0200
Subject: hrtimer: Avoid more SMP function calls in clock_was_set()

By unconditionally updating the offsets there are more indicators
whether the SMP function calls on clock_was_set() can be avoided:

  - When the offset update already happened on the remote CPU then the
    remote update attempt will yield the same sequence number and no
    IPI is required.

  - When the remote CPU is currently handling hrtimer_interrupt(). In
    that case the remote CPU will reevaluate the timer bases before
    reprogramming anyway, so nothing to do.

  - After updating it can be checked whether the first expiring timer in
    the affected clock bases moves before the first expiring (softirq)
    timer of the CPU. If that's not the case then sending the IPI is not
    required.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210713135158.887322464@linutronix.de
---
 kernel/time/hrtimer.c | 74 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5d44c90d41ea..88aefc35f7d2 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -866,6 +866,68 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
     __hrtimer_reprogram(cpu_base, true, timer, expires);
 }

+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
+                             unsigned int active)
+{
+    struct hrtimer_clock_base *base;
+    unsigned int seq;
+    ktime_t expires;
+
+    /*
+     * Update the base offsets unconditionally so the following
+     * checks whether the SMP function call is required works.
+     *
+     * The update is safe even when the remote CPU is in the hrtimer
+     * interrupt or the hrtimer soft interrupt and expiring affected
+     * bases. Either it will see the update before handling a base or
+     * it will see it when it finishes the processing and reevaluates
+     * the next expiring timer.
+     */
+    seq = cpu_base->clock_was_set_seq;
+    hrtimer_update_base(cpu_base);
+
+    /*
+     * If the sequence did not change over the update then the
+     * remote CPU already handled it.
+     */
+    if (seq == cpu_base->clock_was_set_seq)
+        return false;
+
+    /*
+     * If the remote CPU is currently handling an hrtimer interrupt, it
+     * will reevaluate the first expiring timer of all clock bases
+     * before reprogramming. Nothing to do here.
+     */
+    if (cpu_base->in_hrtirq)
+        return false;
+
+    /*
+     * Walk the affected clock bases and check whether the first expiring
+     * timer in a clock base is moving ahead of the first expiring timer of
+     * @cpu_base. If so, the IPI must be invoked because per CPU clock
+     * event devices cannot be remotely reprogrammed.
+     */
+    active &= cpu_base->active_bases;
+
+    for_each_active_base(base, cpu_base, active) {
+        struct timerqueue_node *next;
+
+        next = timerqueue_getnext(&base->active);
+        expires = ktime_sub(next->expires, base->offset);
+        if (expires < cpu_base->expires_next)
+            return true;
+
+        /* Extra check for softirq clock bases */
+        if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
+            continue;
+        if (cpu_base->softirq_activated)
+            continue;
+        if (expires < cpu_base->softirq_expires_next)
+            return true;
+    }
+    return false;
+}
+
 /*
  * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
  * CLOCK_BOOTTIME (for late sleep time injection).
@@ -900,16 +962,10 @@ void clock_was_set(unsigned int bases)
         unsigned long flags;

         raw_spin_lock_irqsave(&cpu_base->lock, flags);
-        /*
-         * Only send the IPI when there are timers queued in one of
-         * the affected clock bases. Otherwise update the base
-         * remote to ensure that the next enqueue of a timer on
-         * such a clock base will see the correct offsets.
-         */
-        if (cpu_base->active_bases & bases)
+
+        if (update_needs_ipi(cpu_base, bases))
             cpumask_set_cpu(cpu, mask);
-        else
-            hrtimer_update_base(cpu_base);
+
         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
     }
--
cgit v1.2.3

From 9482fd71dbb8f0d1a61821a83e467dc0a9d7b429 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 12 Aug 2021 22:31:24 +0200
Subject: hrtimer: Use raw_cpu_ptr() in clock_was_set()

clock_was_set() can be invoked from preemptible context. Use
raw_cpu_ptr() to check whether high resolution mode is active or not. It
does not matter whether the task migrates after acquiring the pointer.

Fixes: e71a4153b7c2 ("hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case")
Reported-by: Mike Galbraith
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/875ywacsmb.ffs@tglx
---
 kernel/time/hrtimer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88aefc35f7d2..33b00e213e07 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -944,10 +944,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
  */
 void clock_was_set(unsigned int bases)
 {
+    struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
     cpumask_var_t mask;
     int cpu;

-    if (!hrtimer_hres_active() && !tick_nohz_active)
+    if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
         goto out_timerfd;

     if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -958,9 +959,9 @@ void clock_was_set(unsigned int bases)
     /* Avoid interrupting CPUs if possible */
     cpus_read_lock();
     for_each_online_cpu(cpu) {
-        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
         unsigned long flags;

+        cpu_base = &per_cpu(hrtimer_bases, cpu);
         raw_spin_lock_irqsave(&cpu_base->lock, flags);

         if (update_needs_ipi(cpu_base, bases))
--
cgit v1.2.3
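The rule of thumb behind the fix, as an illustrative sketch: this_cpu_ptr()
implies a stable CPU (preemption disabled, checked under
CONFIG_DEBUG_PREEMPT), while raw_cpu_ptr() documents that any CPU's
instance is acceptable. The per-CPU variable below is invented for the
example:

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    static DEFINE_PER_CPU(int, my_counter);

    static void percpu_access_rules(void)
    {
        int *p;

        /* Preemptible context: the task may migrate, so asking for
         * "this CPU's" pointer is inherently racy. raw_cpu_ptr()
         * states that any instance is fine, as in the heuristic
         * hres-active check in clock_was_set(). */
        p = raw_cpu_ptr(&my_counter);
        (void)p;

        /* When the value must belong to the CPU we are running on,
         * disable preemption around this_cpu_ptr(). */
        preempt_disable();
        p = this_cpu_ptr(&my_counter);
        (*p)++;
        preempt_enable();
    }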
Fixes: b14bca97c9f5 ("hrtimer: Consolidate reprogramming code") Reported-by: Mike Galbraith Reported-by: Marek Szyprowski Signed-off-by: Thomas Gleixner Tested-by: Mike Galbraith Link: https://lore.kernel.org/r/8735recskh.ffs@tglx --- kernel/time/hrtimer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 33b00e213e07..0ea8702eb516 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,24 +652,10 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -static void -__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal, - struct hrtimer *next_timer, ktime_t expires_next) +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) { - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) - return; - - if (expires_next > cpu_base->expires_next) - return; - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - - cpu_base->next_timer = next_timer; cpu_base->expires_next = expires_next; /* @@ -707,8 +693,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) expires_next = hrtimer_update_next_event(cpu_base); - __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer, - expires_next); + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -863,7 +851,19 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; - __hrtimer_reprogram(cpu_base, true, timer, expires); + if (expires >= cpu_base->expires_next) + return; + + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; + + cpu_base->next_timer = timer; + + __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, -- cgit v1.2.3 From ae460fd9164b16654d8ec06cbc280b832f840eac Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 16:43:40 +0100 Subject: clocksource/drivers/exynos_mct: Prioritise Arm arch timer on arm64 All arm64 CPUs feature an architected timer, which offers a relatively low-latency interface to a per-cpu clocksource and timer. For the most part, using this interface is a no-brainer, with the exception of SoCs where it cannot be used to wake up from deep idle state (i.e. CLOCK_EVT_FEAT_C3STOP is set). On the contrary, the Exynos MCT is extremely slow to access yet can be used as a wakeup source. In preparation for using the Exynos MCT as a potential wakeup timer for the Arm architected timer, reduce its ratings so that the architected timer is preferred. This effectively reverts the decision made in 6282edb72bed ("clocksource/drivers/exynos_mct: Increase priority over ARM arch timer") for arm64, as the reasoning for the original change was to work around a 32-bit SoC design. 
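The selection mechanism itself is simple: among registered sources, the core prefers the one with the highest ->rating. A minimal standalone sketch of that idea (the 400 rating for the arch timer is an assumption for illustration; only the MCT values come from this patch):

	#include <stddef.h>
	#include <stdio.h>

	struct src { const char *name; int rating; };

	static const struct src *pick_best(const struct src *v, size_t n)
	{
		const struct src *best = NULL;

		for (size_t i = 0; i < n; i++)
			if (!best || v[i].rating > best->rating)
				best = &v[i];
		return best;
	}

	int main(void)
	{
		const struct src srcs[] = {
			{ "arch_sys_counter", 400 },	/* assumed arch timer rating */
			{ "mct-frc", 350 },		/* arm64 MCT rating after this patch */
		};

		printf("selected: %s\n", pick_best(srcs, 2)->name);
		return 0;
	}

With the arm64 MCT ratings dropped to 350, the architected timer wins the selection while the MCT stays registered for wakeup duty.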
Cc: Marek Szyprowski Cc: Krzysztof Kozlowski Cc: Chanwoo Choi Cc: Thomas Gleixner Signed-off-by: Will Deacon Tested-by: Krzysztof Kozlowski # exynos-5422 Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210608154341.10794-2-will@kernel.org --- drivers/clocksource/exynos_mct.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index fabad79baafc..804d3e01c8f4 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -51,6 +51,15 @@ #define TICK_BASE_CNT 1 +#ifdef CONFIG_ARM +/* Use values higher than ARM arch timer. See 6282edb72bed. */ +#define MCT_CLKSOURCE_RATING 450 +#define MCT_CLKEVENTS_RATING 500 +#else +#define MCT_CLKSOURCE_RATING 350 +#define MCT_CLKEVENTS_RATING 350 +#endif + enum { MCT_INT_SPI, MCT_INT_PPI @@ -206,7 +215,7 @@ static void exynos4_frc_resume(struct clocksource *cs) static struct clocksource mct_frc = { .name = "mct-frc", - .rating = 450, /* use value higher than ARM arch timer */ + .rating = MCT_CLKSOURCE_RATING, .read = exynos4_frc_read, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -457,7 +466,7 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; - evt->rating = 500; /* use value higher than ARM arch timer */ + evt->rating = MCT_CLKEVENTS_RATING, exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET); -- cgit v1.2.3 From 88183788eacb782eb6e1295f1934fb9531b503d6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 16:43:41 +0100 Subject: clocksource/drivers/exynos_mct: Mark MCT device as CLOCK_EVT_FEAT_PERCPU The "mct_tick" is a per-cpu clockevents device. Set the CLOCK_EVT_FEAT_PERCPU feature to prevent e.g. mct_tick0 being unsafely designated as the global broadcast timer and instead treat the device as a per-cpu wakeup timer. Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Krzysztof Kozlowski Signed-off-by: Will Deacon Acked-by: Krzysztof Kozlowski Reviewed-by: Chanwoo Choi Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210608154341.10794-3-will@kernel.org --- drivers/clocksource/exynos_mct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 804d3e01c8f4..5e3e96d3d1b9 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -465,7 +465,8 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) evt->set_state_oneshot = set_state_shutdown; evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; - evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; + evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERCPU; evt->rating = MCT_CLKEVENTS_RATING, exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET); -- cgit v1.2.3 From faa186adbd06f3e7113ae1dc6766e2273d5d9231 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Thu, 6 May 2021 08:11:36 -0300 Subject: dt-bindings: timer: convert rockchip,rk-timer.txt to YAML Convert Rockchip Timer dt-bindings to YAML. 
Signed-off-by: Ezequiel Garcia Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210506111136.3941-4-ezequiel@collabora.com --- .../bindings/timer/rockchip,rk-timer.txt | 27 --------- .../bindings/timer/rockchip,rk-timer.yaml | 64 ++++++++++++++++++++++ 2 files changed, 64 insertions(+), 27 deletions(-) delete mode 100644 Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt create mode 100644 Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt deleted file mode 100644 index d65fdce7c7f0..000000000000 --- a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt +++ /dev/null @@ -1,27 +0,0 @@ -Rockchip rk timer - -Required properties: -- compatible: should be: - "rockchip,rv1108-timer", "rockchip,rk3288-timer": for Rockchip RV1108 - "rockchip,rk3036-timer", "rockchip,rk3288-timer": for Rockchip RK3036 - "rockchip,rk3066-timer", "rockchip,rk3288-timer": for Rockchip RK3066 - "rockchip,rk3188-timer", "rockchip,rk3288-timer": for Rockchip RK3188 - "rockchip,rk3228-timer", "rockchip,rk3288-timer": for Rockchip RK3228 - "rockchip,rk3229-timer", "rockchip,rk3288-timer": for Rockchip RK3229 - "rockchip,rk3288-timer": for Rockchip RK3288 - "rockchip,rk3368-timer", "rockchip,rk3288-timer": for Rockchip RK3368 - "rockchip,rk3399-timer": for Rockchip RK3399 -- reg: base address of the timer register starting with TIMERS CONTROL register -- interrupts: should contain the interrupts for Timer0 -- clocks : must contain an entry for each entry in clock-names -- clock-names : must include the following entries: - "timer", "pclk" - -Example: - timer: timer@ff810000 { - compatible = "rockchip,rk3288-timer"; - reg = <0xff810000 0x20>; - interrupts = ; - clocks = <&xin24m>, <&cru PCLK_TIMER>; - clock-names = "timer", "pclk"; - }; diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml new file mode 100644 index 000000000000..e26ecb5893ae --- /dev/null +++ b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/rockchip,rk-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip Timer Device Tree Bindings + +maintainers: + - Daniel Lezcano + +properties: + compatible: + oneOf: + - const: rockchip,rk3288-timer + - const: rockchip,rk3399-timer + - items: + - enum: + - rockchip,rv1108-timer + - rockchip,rk3036-timer + - rockchip,rk3066-timer + - rockchip,rk3188-timer + - rockchip,rk3228-timer + - rockchip,rk3229-timer + - rockchip,rk3288-timer + - rockchip,rk3368-timer + - rockchip,px30-timer + - const: rockchip,rk3288-timer + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 2 + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: timer + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + + timer: timer@ff810000 { + compatible = "rockchip,rk3288-timer"; + reg = <0xff810000 0x20>; + interrupts = ; + clocks = <&cru PCLK_TIMER>, <&xin24m>; + clock-names = "pclk", "timer"; + }; -- cgit v1.2.3 From be83c3b6e7b8ff22f72827a613bf6f3aa5afadbb Mon Sep 17 00:00:00 2001 From: Phong Hoang Date: Thu, 22 Apr 2021 14:34:43 +0200 Subject: clocksource/drivers/sh_cmt: 
Fix wrong setting when IRQ is not requested for clock source channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

If a CMT instance has at least two channels, one channel will be used as a clock source and another one as a clock event device. In that case, no IRQ is requested for the clock source channel, so sh_cmt_clock_event_program_verify() might work incorrectly. Besides, when a channel is only used as a clock source, there is no need to re-set the next match_value since it should remain the maximum timeout.

On the other hand, because no IRQ fires, total_cycles is not counted up when the compare match time is reached (the timer counter resets to zero), so sh_cmt_clocksource_read() returns an unexpected value. Using the 64-bit clocksource mask for the 32-bit or 16-bit variants therefore also leads to a wrong delta calculation. Hence, this mask should correspond to the timer counter width, and the above function should just return the raw value of the timer counter register.

Fixes: bfa76bb12f23 ("clocksource: sh_cmt: Request IRQ for clock event device only") Fixes: 37e7742c55ba ("clocksource/drivers/sh_cmt: Fix clocksource width for 32-bit machines") Signed-off-by: Phong Hoang Signed-off-by: Niklas Söderlund Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422123443.73334-1-niklas.soderlund+renesas@ragnatech.se --- drivers/clocksource/sh_cmt.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index d7ed99f0001f..dd0956ad969c 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -579,7 +579,8 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) ch->flags |= flag; /* setup timeout if no clockevent */ - if ((flag == FLAG_CLOCKSOURCE) && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && + flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) __sh_cmt_set_next(ch, ch->max_match_value); out: raw_spin_unlock_irqrestore(&ch->lock, flags); @@ -621,20 +622,25 @@ static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) static u64 sh_cmt_clocksource_read(struct clocksource *cs) { struct sh_cmt_channel *ch = cs_to_sh_cmt(cs); - unsigned long flags; u32 has_wrapped; - u64 value; - u32 raw; - raw_spin_lock_irqsave(&ch->lock, flags); - value = ch->total_cycles; - raw = sh_cmt_get_counter(ch, &has_wrapped); + if (ch->cmt->num_channels == 1) { + unsigned long flags; + u64 value; + u32 raw; - if (unlikely(has_wrapped)) - raw += ch->match_value + 1; - raw_spin_unlock_irqrestore(&ch->lock, flags); + raw_spin_lock_irqsave(&ch->lock, flags); + value = ch->total_cycles; + raw = sh_cmt_get_counter(ch, &has_wrapped); + + if (unlikely(has_wrapped)) + raw += ch->match_value + 1; + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return value + raw; + } - return value + raw; + return sh_cmt_get_counter(ch, &has_wrapped); } static int sh_cmt_clocksource_enable(struct clocksource *cs) @@ -697,7 +703,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, cs->disable = sh_cmt_clocksource_disable; cs->suspend = sh_cmt_clocksource_suspend; cs->resume = sh_cmt_clocksource_resume; - cs->mask = CLOCKSOURCE_MASK(sizeof(u64) * 8); + cs->mask = CLOCKSOURCE_MASK(ch->cmt->info->width); cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n", -- cgit v1.2.3

From 3b87265d825a2d29eb6b67511f0e7ed62225cd97 Mon Sep 17 00:00:00 2001
From: "周琰杰 (Zhou Yanjie)"
Date: Fri, 30 Jul 2021 17:43:08 +0800
Subject: clocksource/drivers/ingenic: Use bitfield macro helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Use "FIELD_GET()" and "FIELD_PREP()" to simplify the code.

[dlezcano] : Changed title

Signed-off-by: 周琰杰 (Zhou Yanjie) Reviewed-by: Paul Cercueil Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1627638188-116163-1-git-send-email-zhouyanjie@wanyeetech.com --- drivers/clocksource/ingenic-sysost.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c index a129840f14f9..cb6fc2f152d4 100644 --- a/drivers/clocksource/ingenic-sysost.c +++ b/drivers/clocksource/ingenic-sysost.c @@ -4,6 +4,7 @@ * Copyright (c) 2020 周琰杰 (Zhou Yanjie) */ +#include <linux/bitfield.h> #include #include #include @@ -34,8 +35,6 @@ /* bits within the OSTCCR register */ #define OSTCCR_PRESCALE1_MASK 0x3 #define OSTCCR_PRESCALE2_MASK 0xc -#define OSTCCR_PRESCALE1_LSB 0 -#define OSTCCR_PRESCALE2_LSB 2 /* bits within the OSTCR register */ #define OSTCR_OST1CLR BIT(0) @@ -98,7 +97,7 @@ static unsigned long ingenic_ost_percpu_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE1_MASK) >> OSTCCR_PRESCALE1_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE1_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -112,7 +111,7 @@ static unsigned long ingenic_ost_global_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE2_MASK) >> OSTCCR_PRESCALE2_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE2_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -151,7 +150,8 @@ static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE1_MASK) | (prescale << OSTCCR_PRESCALE1_LSB); + val &= ~OSTCCR_PRESCALE1_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE1_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; @@ -166,7 +166,8 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE2_MASK) | (prescale << OSTCCR_PRESCALE2_LSB); + val &= ~OSTCCR_PRESCALE2_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE2_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; -- cgit v1.2.3

From ce9570657d45d6387a68d7f419fe70d085200a2f Mon Sep 17 00:00:00 2001
From: Fengquan Chen
Date: Fri, 9 Apr 2021 17:22:42 +0800
Subject: clocksource/drivers/mediatek: Optimize systimer irq clear flow on shutdown

mtk_syst_clkevt_shutdown() is called after the IRQ has been disabled in the suspend flow. Clear any pending systimer IRQ on shutdown to avoid the suspend being aborted due to a pending timer IRQ. Also, on the systimer in MediaTek SoCs, the timer must first be enabled before a systimer IRQ can be cleared.

Fixes: e3af677607d9 ("clocksource/drivers/timer-mediatek: Add support for system timer") Signed-off-by: Fengquan Chen Tested-by: Hsin-Yi Wang Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1617960162-1988-2-git-send-email-Fengquan.Chen@mediatek.com --- drivers/clocksource/timer-mediatek.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-mediatek.c b/drivers/clocksource/timer-mediatek.c index ab63b95e414f..7bcb4a3f26fb 100644 ---
a/drivers/clocksource/timer-mediatek.c +++ b/drivers/clocksource/timer-mediatek.c @@ -60,9 +60,9 @@ * SYST_CON_EN: Clock enable. Shall be set to * - Start timer countdown. * - Allow timeout ticks being updated. - * - Allow changing interrupt functions. + * - Allow changing interrupt status,like clear irq pending. * - * SYST_CON_IRQ_EN: Set to allow interrupt. + * SYST_CON_IRQ_EN: Set to enable interrupt. * * SYST_CON_IRQ_CLR: Set to clear interrupt. */ @@ -75,6 +75,7 @@ static void __iomem *gpt_sched_reg __read_mostly; static void mtk_syst_ack_irq(struct timer_of *to) { /* Clear and disable interrupt */ + writel(SYST_CON_EN, SYST_CON_REG(to)); writel(SYST_CON_IRQ_CLR | SYST_CON_EN, SYST_CON_REG(to)); } @@ -111,6 +112,9 @@ static int mtk_syst_clkevt_next_event(unsigned long ticks, static int mtk_syst_clkevt_shutdown(struct clock_event_device *clkevt) { + /* Clear any irq */ + mtk_syst_ack_irq(to_timer_of(clkevt)); + /* Disable timer */ writel(0, SYST_CON_REG(to_timer_of(clkevt))); -- cgit v1.2.3 From 3a95de59730eb9ac8dd6a367018f5653a873ecaa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 25 Jul 2021 00:44:23 +0200 Subject: clocksource/drivers/fttmr010: Pass around less pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just pass bool flags from the different initcalls and use the flags to set the right pointers. This results in less pointers passed around in init. Cc: Cédric Le Goater Cc: Joel Stanley Signed-off-by: Linus Walleij Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210724224424.2085404-1-linus.walleij@linaro.org --- drivers/clocksource/timer-fttmr010.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/clocksource/timer-fttmr010.c b/drivers/clocksource/timer-fttmr010.c index edb1d5f193f5..126fb1f259b2 100644 --- a/drivers/clocksource/timer-fttmr010.c +++ b/drivers/clocksource/timer-fttmr010.c @@ -271,9 +271,7 @@ static irqreturn_t ast2600_timer_interrupt(int irq, void *dev_id) } static int __init fttmr010_common_init(struct device_node *np, - bool is_aspeed, - int (*timer_shutdown)(struct clock_event_device *), - irq_handler_t irq_handler) + bool is_aspeed, bool is_ast2600) { struct fttmr010 *fttmr010; int irq; @@ -374,8 +372,6 @@ static int __init fttmr010_common_init(struct device_node *np, fttmr010->tick_rate); } - fttmr010->timer_shutdown = timer_shutdown; - /* * Setup clockevent timer (interrupt-driven) on timer 1. 
*/ @@ -383,8 +379,18 @@ static int __init fttmr010_common_init(struct device_node *np, writel(0, fttmr010->base + TIMER1_LOAD); writel(0, fttmr010->base + TIMER1_MATCH1); writel(0, fttmr010->base + TIMER1_MATCH2); - ret = request_irq(irq, irq_handler, IRQF_TIMER, - "FTTMR010-TIMER1", &fttmr010->clkevt); + + if (is_ast2600) { + fttmr010->timer_shutdown = ast2600_timer_shutdown; + ret = request_irq(irq, ast2600_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } else { + fttmr010->timer_shutdown = fttmr010_timer_shutdown; + ret = request_irq(irq, fttmr010_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } if (ret) { pr_err("FTTMR010-TIMER1 no IRQ\n"); goto out_unmap; @@ -432,23 +438,17 @@ out_disable_clock: static __init int ast2600_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - ast2600_timer_shutdown, - ast2600_timer_interrupt); + return fttmr010_common_init(np, true, true); } static __init int aspeed_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, true, false); } static __init int fttmr010_timer_init(struct device_node *np) { - return fttmr010_common_init(np, false, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, false, false); } TIMER_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_init); -- cgit v1.2.3 From f196ae282070d798c9144771db65577910d58566 Mon Sep 17 00:00:00 2001 From: "周琰杰 (Zhou Yanjie)" Date: Fri, 16 Jul 2021 01:36:45 +0800 Subject: dt-bindings: timer: Add ABIs for new Ingenic SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.Add OST_CLK_EVENT_TIMER for new XBurst®1 SoCs. 2.Add OST_CLK_EVENT_TIMER0 to OST_CLK_EVENT_TIMER15 for new XBurst®2 SoCs. Signed-off-by: 周琰杰 (Zhou Yanjie) Acked-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1626370605-120775-1-git-send-email-zhouyanjie@wanyeetech.com --- include/dt-bindings/clock/ingenic,sysost.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/dt-bindings/clock/ingenic,sysost.h b/include/dt-bindings/clock/ingenic,sysost.h index 063791b01ab3..d7aa42c08ded 100644 --- a/include/dt-bindings/clock/ingenic,sysost.h +++ b/include/dt-bindings/clock/ingenic,sysost.h @@ -13,4 +13,23 @@ #define OST_CLK_PERCPU_TIMER2 3 #define OST_CLK_PERCPU_TIMER3 4 +#define OST_CLK_EVENT_TIMER 1 + +#define OST_CLK_EVENT_TIMER0 0 +#define OST_CLK_EVENT_TIMER1 1 +#define OST_CLK_EVENT_TIMER2 2 +#define OST_CLK_EVENT_TIMER3 3 +#define OST_CLK_EVENT_TIMER4 4 +#define OST_CLK_EVENT_TIMER5 5 +#define OST_CLK_EVENT_TIMER6 6 +#define OST_CLK_EVENT_TIMER7 7 +#define OST_CLK_EVENT_TIMER8 8 +#define OST_CLK_EVENT_TIMER9 9 +#define OST_CLK_EVENT_TIMER10 10 +#define OST_CLK_EVENT_TIMER11 11 +#define OST_CLK_EVENT_TIMER12 12 +#define OST_CLK_EVENT_TIMER13 13 +#define OST_CLK_EVENT_TIMER14 14 +#define OST_CLK_EVENT_TIMER15 15 + #endif /* __DT_BINDINGS_CLOCK_INGENIC_OST_H__ */ -- cgit v1.2.3 From d25a025201ed98f4b93775e0999a3f2135702106 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Aug 2021 09:31:28 -0700 Subject: clocksource: Make clocksource watchdog test safe for slow-HZ systems The clocksource watchdog test sets a local JIFFIES_SHIFT macro and assumes that HZ is >= 100. For smaller HZ values this shift value is too large and causes undefined behaviour. 
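A quick standalone check (plain C, approximating TICK_NSEC as NSEC_PER_SEC / HZ; a sketch, not kernel code) illustrates where a fixed shift of 8 stops fitting into the 32-bit mult value:

	#include <stdint.h>
	#include <stdio.h>

	/* mult for the jiffies clocksource is roughly TICK_NSEC << shift
	 * and must fit in a u32. */
	static int fits_u32(unsigned int hz, unsigned int shift)
	{
		uint64_t tick_nsec = 1000000000ULL / hz;

		return (tick_nsec << shift) <= UINT32_MAX;
	}

	int main(void)
	{
		printf("HZ=100, shift 8: %s\n", fits_u32(100, 8) ? "ok" : "overflow");
		printf("HZ=24,  shift 8: %s\n", fits_u32(24, 8) ? "ok" : "overflow");
		printf("HZ=24,  shift 6: %s\n", fits_u32(24, 6) ? "ok" : "overflow");
		return 0;
	}

With HZ=24 a shift of 8 overflows 32 bits while a shift of 6 still fits, matching the JIFFIES_SHIFT ladder below.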
Move the HZ-based definitions of JIFFIES_SHIFT from kernel/time/jiffies.c to kernel/time/tick-internal.h so the clocksource watchdog test can utilize them, which makes it work correctly with all HZ values. [ tglx: Resolved conflicts and massaged changelog ] Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/20210812000133.GA402890@paulmck-ThinkPad-P17-Gen-1/ --- kernel/time/clocksource-wdtest.c | 5 ++--- kernel/time/jiffies.c | 21 +-------------------- kernel/time/tick-internal.h | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 01df12395c0e..df922f49d171 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -19,6 +19,8 @@ #include #include +#include "tick-internal.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -34,9 +36,6 @@ static u64 wdtest_jiffies_read(struct clocksource *cs) return (u64)jiffies; } -/* Assume HZ > 100. */ -#define JIFFIES_SHIFT 8 - static struct clocksource clocksource_wdtest_jiffies = { .name = "wdtest-jiffies", .rating = 1, /* lowest valid rating*/ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 01935aafdb46..bc4db9e5ab70 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -10,28 +10,9 @@ #include #include "timekeeping.h" +#include "tick-internal.h" -/* Since jiffies uses a simple TICK_NSEC multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. TICK_NSEC grows as HZ - * shrinks, so values greater than 8 overflow 32bits when - * HZ=100. - */ -#if HZ < 34 -#define JIFFIES_SHIFT 6 -#elif HZ < 67 -#define JIFFIES_SHIFT 7 -#else -#define JIFFIES_SHIFT 8 -#endif - static u64 jiffies_read(struct clocksource *cs) { return (u64) jiffies; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 3548f0829e6d..649f2b48e8f0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -177,3 +177,23 @@ void clock_was_set(unsigned int bases); void clock_was_set_delayed(void); void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif -- cgit v1.2.3