From 8038dad7e888581266c76df15d70ca457a3c5910 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 25 Feb 2015 10:34:39 -0800
Subject: smpboot: Add common code for notification from dying CPU

RCU ignores offlined CPUs, so they cannot safely run RCU read-side code.
(They -can- use SRCU, but not RCU.)  This means that RCU cannot safely
be used during or after the call to arch_cpu_idle_dead().  Unfortunately,
commit 2ed53c0d6cc99 added a complete() call, which will contain RCU
read-side critical sections if there is a task waiting to be awakened.
As it turns out, there almost never is such a task.  In my qemu/KVM
testing, the to-be-awakened task is not yet asleep more than 99.5% of
the time.  In current mainline, failure is even harder to reproduce,
requiring a virtualized environment that delays the outgoing CPU by at
least three jiffies between the time it exits its stop_machine() task
at CPU_DYING time and the time it calls arch_cpu_idle_dead() from the
idle loop.  However, this problem really can occur, especially in
virtualized environments, and therefore really does need to be fixed.

This suggests moving back to the polling loop, but using a much shorter
wait, with gentle exponential backoff instead of the old 100-millisecond
wait.  Most of the time, the loop will exit without waiting at all, and
almost all of the remaining uses will wait only five microseconds.  If
the outgoing CPU is preempted, the loop will wait one jiffy, then
increase the wait by a factor of 11/10ths, rounding up.  As before,
there is a five-second timeout.

This commit therefore provides common-code infrastructure to do the
dying-to-surviving CPU handoff in a safe manner.  This code also
provides an indication at CPU-online time of whether the CPU to be
onlined previously timed out on offline.  The new cpu_check_up_prepare()
function returns -EBUSY if this CPU previously took more than five
seconds to go offline, or -EAGAIN if it has not yet managed to go
offline.  The rationale for -EAGAIN is that it might still be preempted,
so an additional wait might well find it correctly offlined.
Architecture-specific code can decide how to handle these conditions.
Systems in which CPUs take themselves completely offline might respond
to an -EBUSY return as if it were a zero (success) return.  Systems in
which the surviving CPU must take some action might take it at this
time, or might simply mark the other CPU as unusable.

Note that architectures that take the easy way out and simply pass the
-EBUSY and -EAGAIN returns upwards will change the sysfs API.

Signed-off-by: Paul E. McKenney
Cc:
Cc:
[ paulmck: Fixed state machine for architectures that don't check
  earlier CPU-hotplug results as suggested by James Hogan. ]
---
 include/linux/cpu.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux/cpu.h')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4260e8594bd7..4744ef915acd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,8 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
+#define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
+					* perhaps due to preemption. */
 
 /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
  * operation in progress
@@ -271,4 +273,14 @@ void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
 void arch_cpu_idle_dead(void);
 
+DECLARE_PER_CPU(bool, cpu_dead_idle);
+
+int cpu_report_state(int cpu);
+int cpu_check_up_prepare(int cpu);
+void cpu_set_state_online(int cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+bool cpu_wait_death(unsigned int cpu, int seconds);
+bool cpu_report_death(void);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 #endif /* _LINUX_CPU_H_ */
--
cgit v1.2.3
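
Note: the backoff loop itself lives in kernel/smpboot.c, which this cgit
view (limited to 'include/linux/cpu.h') does not show.  The following is
a rough sketch of cpu_wait_death() as described by the commit message
above, not the actual kernel/smpboot.c body; in particular, the per-CPU
cpu_hotplug_state atomic and its use of the CPU_DEAD value are assumed
for illustration.

#include <linux/atomic.h>
#include <linux/cpu.h>
#include <linux/delay.h>	/* udelay() */
#include <linux/jiffies.h>
#include <linux/kernel.h>	/* DIV_ROUND_UP() */
#include <linux/percpu.h>
#include <linux/sched.h>	/* schedule_timeout_uninterruptible() */

/* Assumed per-CPU offline-handoff state; illustration only. */
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state);

/*
 * Wait up to @seconds for @cpu to report its own death, using gentle
 * exponential backoff rather than the old fixed 100-millisecond wait.
 * Returns true if the CPU went offline in time, false on timeout.
 */
bool cpu_wait_death(unsigned int cpu, int seconds)
{
	int sleep_jf = 1;		/* Jiffies to sleep on each pass. */
	int jf_left = seconds * HZ;	/* Time budget, typically 5 s. */

	/* Common case: the outgoing CPU has already reported death. */
	if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
		return true;

	/* Almost all remaining cases need only this short delay. */
	udelay(5);

	/* The outgoing CPU was presumably preempted; back off gently. */
	while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
		if (jf_left <= 0)
			return false;	/* Timed out; see CPU_BROKEN. */
		schedule_timeout_uninterruptible(sleep_jf);
		jf_left -= sleep_jf;
		/* Grow the wait by a factor of 11/10ths, rounding up. */
		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
	}
	return true;
}

The 11/10ths growth factor keeps the number of wakeups roughly
logarithmic in the total wait time while never overshooting the
five-second budget by much.
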
From 88428cc5c27c63a4313e213813bc39b9899224d5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 28 Jan 2015 14:42:09 -0800
Subject: rcu: Handle outgoing CPUs on exit from idle loop

This commit informs RCU of an outgoing CPU just before that CPU invokes
arch_cpu_idle_dead() during its last pass through the idle loop (via a
new CPU_DYING_IDLE notifier value).  This change means that RCU need not
deal with outgoing CPUs passing through the scheduler after informing
RCU that they are no longer online.

Note that removing the CPU from the rcu_node ->qsmaskinit bit masks is
done at CPU_DYING_IDLE time, and orphaning callbacks is still done at
CPU_DEAD time, the reason being that at CPU_DEAD time we have another
CPU that can adopt them.

Signed-off-by: Paul E. McKenney
---
 include/linux/cpu.h      |  2 ++
 include/linux/rcupdate.h |  2 ++
 kernel/rcu/tree.c        | 41 +++++++++++++++++++++++++++++++----------
 kernel/sched/idle.c      |  2 ++
 4 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'include/linux/cpu.h')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4744ef915acd..d028721748d4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,8 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
+#define CPU_DYING_IDLE		0x000B /* CPU (unsigned)v dying, reached
+					* idle loop. */
 #define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
 					* perhaps due to preemption. */
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..762022f07afd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -266,6 +266,8 @@ void rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
 void rcu_irq_exit(void);
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu);
 
 #ifdef CONFIG_RCU_STALL_COMMON
 void rcu_sysrq_start(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79d53399247e..d5247ed44004 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2475,6 +2475,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	}
 }
 
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function.  We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
+
+	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	mask = rdp->grpmask;
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
+	rnp->qsmaskinitnext &= ~mask;
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
 /*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
@@ -2485,7 +2505,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2498,13 +2517,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	rcu_adopt_orphan_cbs(rsp, flags);
 	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
 
-	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-	mask = rdp->grpmask;
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
-	rnp->qsmaskinitnext &= ~mask;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
@@ -2520,6 +2532,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 {
 }
 
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }
@@ -3733,8 +3749,8 @@ static void rcu_prepare_cpu(int cpu)
 /*
  * Handle CPU online/offline notification events.
  */
-static int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3760,6 +3776,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
 		for_each_rcu_flavor(rsp)
 			rcu_cleanup_dying_cpu(rsp);
 		break;
+	case CPU_DYING_IDLE:
+		for_each_rcu_flavor(rsp) {
+			rcu_cleanup_dying_idle_cpu(cpu, rsp);
+		}
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e99e361ade20..b0090accfb5b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -225,6 +225,8 @@ static void cpu_idle_loop(void)
 		rmb();
 
 		if (cpu_is_offline(smp_processor_id())) {
+			rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+				       (void *)(long)smp_processor_id());
 			smp_mb(); /* all activity before dead. */
 			this_cpu_write(cpu_dead_idle, true);
 			arch_cpu_idle_dead();
--
cgit v1.2.3
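
Note: taken together, the two commits hand architectures a small state
machine to drive.  The sketch below shows how architecture-specific code
might consume it; arch_halt_this_cpu() and arch_boot_secondary() are
hypothetical placeholders, only cpu_report_death(), cpu_check_up_prepare(),
and cpu_set_state_online() come from the API declared in the first patch,
and the return-value handling follows the policy choices described in that
commit message rather than any code shown here.

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/smp.h>

/* Dying side: might be invoked from the arch_cpu_idle_dead() path. */
static void example_cpu_die(void)
{
	/*
	 * Tell the surviving CPU we are gone.  Per the first commit
	 * message, a report that arrives after the five-second timeout
	 * is assumed to return false, with the CPU treated as
	 * CPU_BROKEN at its next online.
	 */
	if (!cpu_report_death())
		pr_warn("CPU %d reported its death late\n",
			smp_processor_id());
	arch_halt_this_cpu();		/* hypothetical low-power halt */
}

/* Onlining side: runs on a surviving CPU before restarting @cpu. */
static int example_cpu_up_prepare(unsigned int cpu)
{
	int ret = cpu_check_up_prepare(cpu);

	switch (ret) {
	case 0:
		break;		/* Previous offline completed normally. */
	case -EBUSY:
		/*
		 * Previous offline timed out.  On systems where CPUs
		 * take themselves completely offline, it may be safe to
		 * treat this as success; others might instead mark the
		 * CPU unusable here.
		 */
		ret = 0;
		break;
	case -EAGAIN:
		/*
		 * The CPU has not yet managed to go offline, perhaps
		 * because it is still preempted; a later retry may find
		 * it correctly offlined, so pass the error upward.
		 */
		return ret;
	default:
		return ret;
	}
	return arch_boot_secondary(cpu);	/* hypothetical */
}

/* Incoming side: the freshly onlined CPU marks itself online. */
static void example_cpu_starting(void)
{
	cpu_set_state_online(smp_processor_id());
}
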