From 48d07c04b4cc1dc1221965312f58fd84926212fe Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2019 22:13:33 +0100 Subject: rcu: Enable elimination of Tree-RCU softirq processing Some workloads need to change kthread priority for RCU core processing without affecting other softirq work. This commit therefore introduces the rcutree.use_softirq kernel boot parameter, which moves the RCU core work from softirq to a per-CPU SCHED_OTHER kthread named rcuc. Use of the SCHED_OTHER approach avoids the scalability problems that appeared with the earlier attempt to move RCU core processing from softirq to kthreads. That said, kernels built with RCU_BOOST=y will run the rcuc kthreads at the RCU-boosting priority. Note that rcutree.use_softirq=0 must be specified to move RCU core processing to the rcuc kthreads: rcutree.use_softirq=1 is the default. Reported-by: Thomas Gleixner Tested-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Adjust for invoke_rcu_callbacks() only ever being invoked from RCU core processing, in contrast to softirq->rcuc transition in old mainline RCU priority boosting. ] [ paulmck: Avoid wakeups when scheduler might have invoked rcu_read_unlock() while holding rq or pi locks, also possibly fixing a pre-existing latent bug involving raise_softirq()-induced wakeups. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 138 ++++++++++++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 134 +++++---------------------------------------- 3 files changed, 140 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..8e290163505a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -51,6 +51,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -92,6 +98,9 @@ struct rcu_state rcu_state = { /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = 1; +module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* Perform RCU core processing work for the current CPU.
*/ -static __latent_entropy void rcu_core(struct softirq_action *unused) +static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2295,29 +2304,131 @@ static __latent_entropy void rcu_core(struct softirq_action *unused) trace_rcu_utilization(TPS("End RCU core")); } +static void rcu_core_si(struct softirq_action *h) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); + } + /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Do RCU callback invocation. Note that if we are running !use_softirq, + * we are already in the rcuc kthread. If callbacks are offloaded, then + * ->cblist is always empty, so we don't get here. Therefore, we only + * ever need to check for the scheduler being operational (some callbacks + * do wakeups, so we do need the scheduler). */ static void invoke_rcu_callbacks(struct rcu_data *rdp) { if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); - return; - } - invoke_rcu_callbacks_kthread(); + rcu_do_batch(rdp); } +/* + * Wake up this CPU's rcuc kthread to do RCU core processing. + */ static void invoke_rcu_core(void) { - if (cpu_online(smp_processor_id())) + if (!cpu_online(smp_processor_id())) + return; + if (use_softirq) raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting.
+ */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } + } + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; +} + +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. + */ +static int __init rcu_spawn_core_kthreads(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; } +early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -3355,7 +3466,8 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_core); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e253d11af3c4..a1a72a1ecb02 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -407,8 +407,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..21611862e083 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -11,29 +11,7 @@ * Paul E. McKenney */ -#include -#include -#include -#include -#include -#include -#include -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST #include "../locking/rtmutex_common.h" -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. 
- */ -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) -#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -627,7 +607,7 @@ static void rcu_read_unlock_special(struct task_struct *t) if (preempt_bh_were_disabled || irqs_were_disabled) { WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { + if (irqs_were_disabled && use_softirq) { /* Enabling irqs does not reschedule, so... */ raise_softirq_irqoff(RCU_SOFTIRQ); } else { @@ -944,18 +924,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +/* + * If boosting, set rcuc kthreads to realtime priority. + */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ #ifdef CONFIG_RCU_BOOST + struct sched_param sp; -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ } +#ifdef CONFIG_RCU_BOOST + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1090,23 +1073,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), - __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - /* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. @@ -1160,59 +1126,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_data.rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. 
- */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_do_batch(this_cpu_ptr(&rcu_data)); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1243,27 +1156,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_data.rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1286,11 +1185,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; -- cgit v1.2.3 From 23634ebc1d946f19eb112d4455c1d84948875e31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Mar 2019 15:25:51 -0700 Subject: rcu: Check for wakeup-safe conditions in rcu_read_unlock_special() When RCU core processing is offloaded from RCU_SOFTIRQ to the rcuc kthreads, a full and unconditional wakeup is required to initiate RCU core processing. In contrast, when RCU core processing is carried out by RCU_SOFTIRQ, a raise_softirq() suffices. Of course, there are situations where raise_softirq() does a full wakeup, but these do not occur with normal usage of rcu_read_unlock(). The reason that full wakeups can be problematic is that the scheduler sometimes invokes rcu_read_unlock() with its pi or rq locks held, which can of course result in deadlock in CONFIG_PREEMPT=y kernels when rcu_read_unlock() invokes the scheduler. Scheduler invocations can happen in the following situations: (1) The just-ended reader has been subjected to RCU priority boosting, in which case rcu_read_unlock() must deboost, (2) Interrupts were disabled across the call to rcu_read_unlock(), so the quiescent state must be deferred, requiring a wakeup of the rcuc kthread corresponding to the current CPU. 
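[ Editor's note: a minimal sketch of case (2), not part of the patch; do_something() is a hypothetical placeholder and the deferral details are simplified: local_irq_disable(); rcu_read_lock(); do_something(); rcu_read_unlock(); /* irqs are off, so the quiescent state must be deferred; with rcutree.use_softirq=0, reporting it here would require waking the rcuc kthread. */ local_irq_enable(); /* Deferred quiescent state is reported later, for example at the next context switch. */ ]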
Now, the scheduler may hold one of its locks across rcu_read_unlock() only if preemption has been disabled across the entire RCU read-side critical section, which in the days prior to RCU flavor consolidation meant that rcu_read_unlock() never needed to do wakeups. However, this is no longer the case for any but the first rcu_read_unlock() following a condition (e.g., preempted RCU reader) requiring special rcu_read_unlock() attention. For example, an RCU read-side critical section might be preempted, but preemption might be disabled across the rcu_read_unlock(). The rcu_read_unlock() must defer the quiescent state, and therefore leaves the task queued on its leaf rcu_node structure. If a scheduler interrupt occurs, the scheduler might well invoke rcu_read_unlock() with one of its locks held. However, the preempted task is still queued, so rcu_read_unlock() will attempt to defer the quiescent state once more. When RCU core processing is carried out by RCU_SOFTIRQ, this works just fine: The raise_softirq() function simply sets a bit in a per-CPU mask and the RCU core processing will be undertaken upon return from interrupt. Not so when RCU core processing is carried out by the rcuc kthread: In this case, the required wakeup can result in deadlock. The initial solution to this problem was to use set_tsk_need_resched() and set_preempt_need_resched() to force a future context switch, which allows rcu_preempt_note_context_switch() to report the deferred quiescent state to RCU's core processing. Unfortunately for expedited grace periods, there can be a significant delay between the call for a context switch and the actual context switch. This commit therefore introduces a ->deferred_qs flag to the task_struct structure's rcu_special structure. This flag is initially false, and is set to true by the first call to rcu_read_unlock() requiring special attention, then reset back to false when the quiescent state is finally reported. Then rcu_read_unlock() attempts full wakeups only when ->deferred_qs is false, that is, on the first rcu_read_unlock() requiring special attention. Note that a chain of RCU readers linked by some other sort of reader may find that a later rcu_read_unlock() is once again able to do a full wakeup, courtesy of an intervening preemption: rcu_read_lock(); /* preempted */ local_irq_disable(); rcu_read_unlock(); /* Can do full wakeup, sets ->deferred_qs. */ rcu_read_lock(); local_irq_enable(); preempt_disable(); rcu_read_unlock(); /* Cannot do full wakeup, ->deferred_qs set. */ rcu_read_lock(); preempt_enable(); /* preempted, ->deferred_qs reset. */ local_irq_disable(); rcu_read_unlock(); /* Can again do full wakeup, sets ->deferred_qs. */ Such linked RCU readers do not yet seem to appear in the Linux kernel, and it is probably best if they don't. However, RCU needs to handle them, and some variations on this theme could make even raise_softirq() unsafe due to the possibility of its doing a full wakeup. This commit therefore also avoids invoking raise_softirq() when the ->deferred_qs flag is set. Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior --- include/linux/sched.h | 2 +- kernel/rcu/tree_plugin.h | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11837410690f..942a44c1b8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -565,7 +565,7 @@ union rcu_special { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance.
*/ - u8 pad; /* No garbage from compiler! */ + u8 deferred_qs; } b; /* Bits. */ u32 s; /* Set of bits. */ }; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 21611862e083..75110ea75d01 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -455,6 +455,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; @@ -605,16 +606,24 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled && use_softirq) { - /* Enabling irqs does not reschedule, so... */ + t->rcu_read_unlock_special.b.exp_hint = false; + // Need to defer quiescent state until everything is enabled. + if (irqs_were_disabled && use_softirq && + (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + // Using softirq, safe to awaken, and we get + // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); + } else if (irqs_were_disabled && !use_softirq && + !t->rcu_read_unlock_special.b.deferred_qs) { + // Safe to awaken and we get no help from enabling + // irqs, unlike bh/preempt. + invoke_rcu_core(); } else { - /* Enabling BH or preempt does reschedule, so... */ + // Enabling BH or preempt does reschedule, so... set_tsk_need_resched(current); set_preempt_need_resched(); } + t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3 From 25102de65fdd246eb6801114ce6dfa3a076bb678 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 14:12:50 -0700 Subject: rcu: Only do rcu_read_unlock_special() wakeups if expedited Currently, rcu_read_unlock_special() will do wakeups whenever it is safe to do so. However, wakeups are expensive, and they are only really needed when the just-ended RCU read-side critical section is blocking an expedited grace period (in which case speed is of the essence) or on a nohz_full CPU (where it might be a good long time before an interrupt arrives). This commit therefore checks for these conditions, and does the expensive wakeups only if doing so would be useful. Note it can be rather expensive to determine whether or not the current task (as opposed to the current CPU) is blocking the current expedited grace period. Doing so requires traversing the ->blkd_tasks list, which can be quite long. This commit therefore cheats: If the current task is on a given ->blkd_tasks list, and some task on that list is blocking the current expedited grace period, the code assumes that the current task is blocking that expedited grace period. Reported-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75110ea75d01..d15cdab6aeb4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -606,20 +606,28 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { + bool exp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + t->rcu_read_unlock_special.b.exp_hint = false; + exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + (rdp->grpmask & rnp->expmask) || + tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && + if (exp && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (irqs_were_disabled && !use_softirq && + } else if (exp && irqs_were_disabled && !use_softirq && !t->rcu_read_unlock_special.b.deferred_qs) { // Safe to awaken and we get no help from enabling // irqs, unlike bh/preempt. invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... + // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); } -- cgit v1.2.3 From 385b599e8c04fa843c4d7f785478827cc512d720 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 15:12:47 -0700 Subject: rcu: Allow rcu_read_unlock_special() to raise_softirq() if in_irq() When running in an interrupt handler, raise_softirq() and raise_softirq_irqoff() have extremely low overhead: They simply set a bit in a per-CPU mask, which is checked upon exit from that interrupt handler. Therefore, if rcu_read_unlock_special() is invoked within an interrupt handler and RCU_SOFTIRQ is in use, this commit makes use of raise_softirq_irqoff() even if there is no expedited grace period in flight and even if this is not a nohz_full CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d15cdab6aeb4..e1005f5e8094 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,7 +615,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (exp && irqs_were_disabled && use_softirq && + if ((exp || in_irq()) && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. -- cgit v1.2.3 From 0864f057b050bc6dd68106b3185e02db5140012d Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Thu, 4 Apr 2019 12:19:25 -0700 Subject: rcu: Use irq_work to get scheduler's attention in clean context When rcu_read_unlock_special() is invoked with interrupts disabled, is either not in an interrupt handler or is not using RCU_SOFTIRQ, is not the first RCU read-side critical section in the chain, and either there is an expedited grace period in flight or this is a NO_HZ_FULL kernel, the end of the grace period can be unduly delayed. The reason for this is that it is not safe to do wakeups in this situation. This commit fixes this problem by using the irq_work subsystem to force a later interrupt handler in a clean environment. Because set_tsk_need_resched(current) and set_preempt_need_resched() are invoked prior to this, the scheduler will force a context switch upon return from this interrupt (though perhaps at the end of any interrupted preempt-disable or BH-disable region of code), which will invoke rcu_note_context_switch() (again in a clean environment), which will in turn give RCU the chance to report the deferred quiescent state. Of course, by then this task might be within another RCU read-side critical section. But that will be detected at that time and reporting will be further deferred to the outermost rcu_read_unlock(). See rcu_preempt_need_deferred_qs() and rcu_preempt_deferred_qs() for more details on the checking. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a1a72a1ecb02..21d740f0b8dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -161,6 +161,8 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + bool defer_qs_iw_pending; /* Scheduler attention pending? */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e1005f5e8094..58c7853f19e7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -587,6 +587,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) t->rcu_read_lock_nesting += RCU_NEST_BIAS; } +/* + * Minimal handler to give the scheduler a chance to re-evaluate. + */ +static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + + rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + rdp->defer_qs_iw_pending = false; +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -630,6 +641,15 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->defer_qs_iw_pending && exp) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = true; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); -- cgit v1.2.3 From 43e903ad3e0843d03da15d8eaffb5ada22966c76 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 25 Mar 2019 08:36:03 -0700 Subject: rcu: Inline invoke_rcu_callbacks() into its sole remaining caller This commit saves a few lines of code by inlining invoke_rcu_callbacks() into its sole remaining caller. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e290163505a..7822a2e1370d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -147,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -2296,8 +2295,9 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) + rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); @@ -2332,20 +2332,6 @@ static void invoke_rcu_core_kthread(void) local_irq_restore(flags); } -/* - * Do RCU callback invocation. Not that if we are running !use_softirq, - * we are already in the rcuc kthread. If callbacks are offloaded, then - * ->cblist is always empty, so we don't get here. Therefore, we only - * ever need to check for the scheduler being operational (some callbacks - * do wakeups, so we do need the scheduler). - */ -static void invoke_rcu_callbacks(struct rcu_data *rdp) -{ - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - rcu_do_batch(rdp); -} - /* * Wake up this CPU's rcuc kthread to do RCU core processing. */ -- cgit v1.2.3 From b9ad4d6ed18e23b0ff6a824b925a1278625d5345 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 09:09:47 -0700 Subject: rcu: Avoid self-IPI in sync_rcu_exp_select_node_cpus() Although sync_rcu_exp_select_node_cpus() treats the current CPU as being in a quiescent state, it might well migrate to some other CPU before reaching the smp_call_function_single(), which could then result in an unnecessary simulated self-IPI. This commit therefore instead simply refuses to invoke smp_call_function_single() on the current CPU, which causes the later rcu_report_exp_cpu_mult() to report this CPU's quiescent state with less overhead. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. Signed-off-by: Paul E. McKenney [ paulmck: Use get_cpu() instead of preempt_disable() per Joel Fernandes. 
] --- kernel/rcu/tree_exp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..5390618787b6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -384,7 +384,12 @@ retry_ipi: mask_ofl_test |= mask; continue; } + if (get_cpu() == cpu) { + put_cpu(); + continue; + } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); if (!ret) { mask_ofl_ipi &= ~mask; continue; -- cgit v1.2.3 From e015a341122024198f57d1f0498a776523137e94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 10:03:12 -0700 Subject: rcu: Avoid self-IPI in sync_sched_exp_online_cleanup() The sync_sched_exp_online_cleanup() is invoked at online time to handle the case where the start of an expedited grace period ran concurrently with a CPU being taken offline and then immediately being placed online. It checks to see if RCU needs an expedited quiescent state from the incoming CPU, sending it an IPI if so. However, it is quite possible that sync_sched_exp_online_cleanup() is running on that CPU, in which case it is considerably less overhead to simply request the quiescent state locally instead of simulating a self-IPI. This commit therefore places the last few lines of rcu_exp_handler() into a new rcu_exp_need_qs() function, which is invoked both by rcu_exp_handler() and by sync_sched_exp_online_cleanup() in the self-IPI case. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. This in turn will allow tighter error checking in rcu_is_cpu_rrupt_from_idle(). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree_exp.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 5390618787b6..de1b4acf6979 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -699,6 +699,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -714,25 +724,38 @@ static void rcu_exp_handler(void *unused) rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + rcu_exp_need_qs(); } /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ static void sync_sched_exp_online_cleanup(int cpu) { + unsigned long flags; + int my_cpu; struct rcu_data *rdp; int ret; struct rcu_node *rnp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + my_cpu = get_cpu(); + /* Quiescent state either not needed or already requested, leave. 
*/ + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + put_cpu(); return; + } + /* Quiescent state needed on current CPU, so set it up locally. */ + if (my_cpu == cpu) { + local_irq_save(flags); + rcu_exp_need_qs(); + local_irq_restore(flags); + put_cpu(); + return; + } + /* Quiescent state needed on some other CPU, send IPI. */ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); WARN_ON_ONCE(ret); } -- cgit v1.2.3 From eddded80121f2a7bda810f65bf7cb648a709ed11 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:09 -0400 Subject: rcu: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle() It would be good to combine the dynticks and dynticks_nesting counters in order to simplify the code. Unfortunately, there are concerns about usermode upcalls appearing to RCU as half of an interrupt, as Byungchul learned [1]. The "half" in "half interrupt" is due to an unpaired rcu_irq_enter(): Normally, each rcu_irq_enter() has a later call to rcu_irq_exit(). Out of an abundance of caution, Paul added warnings [2] in the RCU code which if not fired by 2021 will be interpreted as meaning that this half-interrupt scenario cannot happen any more, thus permitting simplification of this code. In the meantime, this commit makes the following changes: (1) Combining these two counters requires that rcu_rrupt_from_idle() is invoked only from hard-interrupt contexts as discussed here [3]. This commit therefore adds the required lockdep_assert_in_irq() to check this constraint. (2) Furthermore, rcu_rrupt_from_idle() is not explicit about how it is using the counters which can lead to weird future bugs. This commit therefore adds comments indicating the meaning and use of each counter. (3) Lastly, this commit checks for counter underflows as another check that half interrupts don't occur. (Previously, the function would simply return true upon underflow.) All these checks are NOOPs if PROVE_LOCKING (and thus PROVE_RCU) is disabled. [1] https://lore.kernel.org/patchwork/patch/952349/ [2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts") [3] https://lore.kernel.org/lkml/20190312150514.GB249405@google.com/ Cc: byungchul.park@lge.com Cc: kernel-team@android.com Cc: rcu@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7822a2e1370d..b9629cf08f94 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -376,16 +376,29 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) } /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle * - * If the current CPU is idle and running at a first-level (not nested) * interrupt from idle, return true. The caller must have at least * disabled preemption.
*/ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; + /* Called only from within the scheduling-clock interrupt */ + lockdep_assert_in_irq(); + + /* Check for counter underflows */ + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + "RCU dynticks_nesting counter underflow!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + "RCU dynticks_nmi_nesting counter underflow/zero!"); + + /* Are we at first interrupt nesting level? */ + if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + return false; + + /* Does CPU appear to be idle from an RCU standpoint? */ + return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ -- cgit v1.2.3 From 1bb336443cde1154600bd147a45a30baa59c57db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 15:51:25 -0700 Subject: rcu: Rename rcu_data's ->deferred_qs to ->exp_deferred_qs The rcu_data structure's ->deferred_qs field is used to indicate that the current CPU is blocking an expedited grace period (perhaps a future one). Given that it is used only for expedited grace periods, its current name is misleading, so this commit renames it to ->exp_deferred_qs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_exp.h | 8 ++++---- kernel/rcu/tree_plugin.h | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 21d740f0b8dc..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -154,7 +154,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index de1b4acf6979..e0c928d04be5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); + WRITE_ONCE(rdp->exp_deferred_qs, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -616,7 +616,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -638,7 +638,7 @@ static void rcu_exp_handler(void *unused) if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -661,7 +661,7 @@ static void rcu_exp_handler(void *unused) * * Otherwise, force a context switch after the CPU enables everything. 
*/ - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { rcu_preempt_deferred_qs(t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 58c7853f19e7..1aeb4ae187ce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -237,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->exp_deferred_qs); } /* @@ -337,7 +337,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -451,7 +451,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -459,7 +459,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { + if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -471,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->deferred_qs) { + if (rdp->exp_deferred_qs) { rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -560,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.deferred_qs) || + return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } -- cgit v1.2.3 From f0b635627395223d3c60a3105372b4349e04772f Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:21:55 +0800 Subject: rcu: Remove unused rdp local from synchronize_rcu_expedited() Because rdp is initialized but never used in synchronize_rcu_expedited(), this commit removes it. Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0c928d04be5..8e539710721a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -793,7 +793,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - struct rcu_data *rdp; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -830,7 +829,6 @@ void synchronize_rcu_expedited(void) } /* Wait for expedited grace period to complete. 
*/ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); -- cgit v1.2.3 From cd6d17b4a4646d4bf2568f3a4de13a5a13e2ed28 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 15:25:52 +0530 Subject: rcu: Dump specified number of blocked tasks The dump_blkd_tasks() function dumps at most 10 blocked tasks, ignoring the value of the ncheck parameter. This commit therefore substitutes the value of ncheck for the hard-coded value of 10. Because all callers currently pass 10 as the number, this patch does not change behavior, but it is clearly an accident waiting to happen. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..3a9891a74ead 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -760,7 +760,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); -- cgit v1.2.3 From 3ae976a7e3e87438b8439a01aeb79d4866b1c444 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 16:57:08 +0530 Subject: rcu: Correctly unlock root node in rcu_check_gp_start_stall() On systems whose rcu_node tree has only one node, the rcu_check_gp_start_stall() function's values of rnp and rnp_root will be identical. In this case, it clearly does not make sense to release both rnp->lock and rnp_root->lock, but that is exactly what this function does in the last early exit. This commit therefore unlocks only rnp->lock when rnp and rnp_root are equal. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f65a73a97323..065183391f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, time_before(j, rcu_state.gp_req_activity + gpssdelay) || time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + if (rnp_root != rnp) + /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } -- cgit v1.2.3 From d5a9a8c3bc8068f2e5dfba30150ac09b596b461a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Apr 2019 17:01:39 -0700 Subject: rcu: Set a maximum limit for back-to-back callback invocation Currently, if a CPU has more than 10,000 callbacks pending, it will increase rdp->blimit to LONG_MAX. If you are lucky, LONG_MAX is only about two billion, but this is still a bit too many callbacks to invoke back-to-back while otherwise ignoring the world. This commit therefore sets a maximum limit of DEFAULT_MAX_RCU_BLIMIT, which is set to 10,000, for rdp->blimit. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..f888a76673da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -380,7 +380,8 @@ static int rcu_is_cpu_rrupt_from_idle(void) __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ +#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ static long blimit = DEFAULT_RCU_BLIMIT; #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ static long qhimark = DEFAULT_RCU_QHIMARK; @@ -2113,7 +2114,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ @@ -2354,7 +2355,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) rcu_force_quiescent_state(); -- cgit v1.2.3 From fe15b50cdeeebd9248bf27e3c31278668f08bc04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Apr 2019 16:15:00 -0700 Subject: srcu: Allocate per-CPU data for DEFINE_SRCU() in modules Adding DEFINE_SRCU() or DEFINE_STATIC_SRCU() to a loadable module requires that the size of the reserved region be increased, which is not something we want to be doing all that often. One approach would be to require that loadable modules define an srcu_struct and invoke init_srcu_struct() from their module_init function and cleanup_srcu_struct() from their module_exit function. However, this is more than a bit user unfriendly. This commit therefore creates an ___srcu_struct_ptrs linker section, and pointers to srcu_struct structures created by DEFINE_SRCU() and DEFINE_STATIC_SRCU() within a module are placed into that module's ___srcu_struct_ptrs section. The required init_srcu_struct() and cleanup_srcu_struct() functions are then automatically invoked as needed when that module is loaded and unloaded, thus allowing modules to continue to use DEFINE_SRCU() and DEFINE_STATIC_SRCU() while avoiding the need to increase the size of the reserved region. Many of the algorithms and some of the code was cheerfully cherry-picked from other code making use of linker sections, perhaps most notably from tracepoints. All bugs are nevertheless the sole property of the author. Suggested-by: Mathieu Desnoyers [ paulmck: Use __section() and use "default" in srcu_module_notify()'s "switch" statement as suggested by Joel Fernandes. ] Signed-off-by: Paul E. 
McKenney Tested-by: Joel Fernandes (Google) --- include/asm-generic/vmlinux.lds.h | 4 +++ include/linux/module.h | 5 +++ include/linux/srcutree.h | 14 +++++++-- kernel/module.c | 5 +++ kernel/rcu/srcutree.c | 65 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 088987e9a3ea..ba1ad39468fc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -337,6 +337,10 @@ KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \ __stop___tracepoints_ptrs = .; \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + . = ALIGN(8); \ + __start___srcu_struct = .; \ + *(___srcu_struct_ptrs) \ + __end___srcu_struct = .; \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/module.h b/include/linux/module.h index 188998d3dca9..1455812dd325 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -450,6 +451,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_TREE_SRCU + unsigned int num_srcu_structs; + struct srcu_struct **srcu_struct_ptrs; +#endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 7f7c8c050f63..8af1824c46a8 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -120,9 +120,17 @@ struct srcu_struct { * * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data) +#ifdef MODULE +# define __DEFINE_SRCU(name, is_static) \ + is_static struct srcu_struct name; \ + struct srcu_struct *__srcu_struct_##name \ + __section("___srcu_struct_ptrs") = &name +#else +# define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ + is_static struct srcu_struct name = \ + __SRCU_STRUCT_INIT(name, name##_srcu_data) +#endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..c79a53b629b6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3095,6 +3095,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif #ifdef CONFIG_BPF_EVENTS mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", sizeof(*mod->bpf_raw_events), diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9b761e546de8..2ded2614a2f4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1310,3 +1310,68 @@ void __init srcu_init(void) queue_work(rcu_gp_wq, &ssp->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. 
*/ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + int ret; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ret = init_srcu_struct(*(sspp++)); + if (WARN_ON_ONCE(ret)) + return ret; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) + cleanup_srcu_struct(*(sspp++)); +} + +/* Handle one module, either coming or going. */ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ -- cgit v1.2.3 From 11b000457f4638cf2a9e6794d31636d2d3174842 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:22:56 +0800 Subject: rcu: Make __call_srcu static Because __call_srcu() is not used outside kernel/rcu/srcutree.c, this commit makes it static. Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2ded2614a2f4..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; int idx; -- cgit v1.2.3 From 95bf33b55ff4465399bad843f1d8d618c8baf1f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 14:07:24 +0200 Subject: rcu/sync: Kill rcu_sync_type/gp_type Now that the RCU flavors have been consolidated, rcu_sync_type makes no sense because none of internal update functions aside from .held() depend on gp_type. This commit therefore removes this field and consolidates the relevant code. Signed-off-by: Oleg Nesterov [ paulmck: Added RCU and RCU-bh checks to rcu_sync_is_idle(). ] [ paulmck: And applied subsequent feedback from Oleg Nesterov. ] Signed-off-by: Paul E. 
McKenney --- include/linux/percpu-rwsem.h | 2 +- include/linux/rcu_sync.h | 36 ++++++++-------------------- kernel/locking/percpu-rwsem.c | 2 +- kernel/rcu/sync.c | 55 ++++--------------------------------------- 4 files changed, 17 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 03cb4b6f842e..6887636ea169 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -20,7 +20,7 @@ struct percpu_rw_semaphore { #define DEFINE_STATIC_PERCPU_RWSEM(name) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ static struct percpu_rw_semaphore name = { \ - .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ + .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 6fc53a1345b3..87971e85519c 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -13,8 +13,6 @@ #include #include -enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; - /* Structure to mediate between updaters and fastpath-using readers. */ struct rcu_sync { int gp_state; @@ -23,52 +21,38 @@ struct rcu_sync { int cb_state; struct rcu_head cb_head; - - enum rcu_sync_type gp_type; }; -extern void rcu_sync_lockdep_assert(struct rcu_sync *); - /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? * @rsp: Pointer to rcu_sync structure to use for synchronization * - * Returns true if readers are permitted to use their fastpaths. - * Must be invoked within an RCU read-side critical section whose - * flavor matches that of the rcu_sync struture. + * Returns true if readers are permitted to use their fastpaths. Must be + * invoked within some flavor of RCU read-side critical section. 
*/ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { -#ifdef CONFIG_PROVE_RCU - rcu_sync_lockdep_assert(rsp); -#endif + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && + !rcu_read_lock_bh_held() && + !rcu_read_lock_sched_held(), + "suspicious rcu_sync_is_idle() usage"); return !rsp->gp_state; /* GP_IDLE */ } -extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_init(struct rcu_sync *); extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); -#define __RCU_SYNC_INITIALIZER(name, type) { \ +#define __RCU_SYNC_INITIALIZER(name) { \ .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ .cb_state = 0, \ - .gp_type = type, \ } -#define __DEFINE_RCU_SYNC(name, type) \ - struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type) - -#define DEFINE_RCU_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_SYNC) - -#define DEFINE_RCU_SCHED_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC) - -#define DEFINE_RCU_BH_SYNC(name) \ - __DEFINE_RCU_SYNC(name, RCU_BH_SYNC) +#define DEFINE_RCU_SYNC(name) \ + struct rcu_sync name = __RCU_SYNC_INITIALIZER(name) #endif /* _LINUX_RCU_SYNC_H_ */ diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..48cab93a47fd 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + rcu_sync_init(&sem->rss); __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); sem->readers_block = 0; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index a8304d90573f..ee427e138dad 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,65 +10,20 @@ #include #include -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } /** @@ -114,7 +69,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) WARN_ON_ONCE(need_wait && need_sync); if 
(need_sync) { - gp_ops[rsp->gp_type].sync(); + synchronize_rcu(); rsp->gp_state = GP_PASSED; wake_up_all(&rsp->gp_wait); } else if (need_wait) { @@ -167,7 +122,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * to catch a later GP. */ rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else { /* * We're at least a GP after rcu_sync_exit(); eveybody will now @@ -195,7 +150,7 @@ void rcu_sync_exit(struct rcu_sync *rsp) if (!--rsp->gp_count) { if (rsp->cb_state == CB_IDLE) { rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else if (rsp->cb_state == CB_PENDING) { rsp->cb_state = CB_REPLAY; } @@ -220,7 +175,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) spin_unlock_irq(&rsp->rss_lock); if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); + rcu_barrier(); WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } -- cgit v1.2.3 From 2bf1acc299c9757932ef8c6edfaacca6d08302b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 17:21:02 +0200 Subject: uprobes: Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem. Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/events/uprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..97c367f0a9aa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -2302,7 +2302,5 @@ void __init uprobes_init(void) for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -- cgit v1.2.3 From 3f2947b78151ec938dc06aea4ba0e11e56becdff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 18:32:41 +0200 Subject: locking/percpu-rwsem: Add DEFINE_PERCPU_RWSEM(), use it to initialize cgroup_threadgroup_rwsem Turn DEFINE_STATIC_PERCPU_RWSEM() into __DEFINE_PERCPU_RWSEM() with the additional "is_static" argument to introduce DEFINE_PERCPU_RWSEM(). Change cgroup.c to use DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem). Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. 
McKenney --- include/linux/percpu-rwsem.h | 8 ++++++-- kernel/cgroup/cgroup.c | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 6887636ea169..2809b44cbbee 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -17,14 +17,18 @@ struct percpu_rw_semaphore { int readers_block; }; -#define DEFINE_STATIC_PERCPU_RWSEM(name) \ +#define __DEFINE_PERCPU_RWSEM(name, is_static) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ -static struct percpu_rw_semaphore name = { \ +is_static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ } +#define DEFINE_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, /* not static */) +#define DEFINE_STATIC_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, static) extern int __percpu_down_read(struct percpu_rw_semaphore *, int); extern void __percpu_up_read(struct percpu_rw_semaphore *); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..b112e93388dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ @@ -5616,7 +5616,6 @@ int __init cgroup_init(void) int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); -- cgit v1.2.3 From 89da3b94bb97417ca2c5b0ce3a28643819030247 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Apr 2019 18:50:55 +0200 Subject: rcu/sync: Simplify the state machine With this patch rcu_sync has a single state variable and the transition rules become really simple:

GP_IDLE   - owned by the first rcu_sync_enter() which moves it to
GP_ENTER  - owned by rcu-callback which moves it to
GP_PASSED - owned by the last rcu_sync_exit() which moves it to
GP_EXIT   - and this is the only "nontrivial" state. rcu-callback moves it back to GP_IDLE unless another enter() comes before a GP pass. If rcu-callback is invoked before the next rcu_sync_exit() it must see gp_count incremented by that enter() and set GP_PASSED. Otherwise, if the next rcu_sync_exit() wins the race, it will move it to
GP_REPLAY - owned by rcu-callback which moves it to GP_EXIT

Signed-off-by: Oleg Nesterov [ paulmck: While here, apply READ_ONCE() and WRITE_ONCE() to ->gp_state. ] [ paulmck: Tweaks to make htmldocs happy. (Reported by kbuild test robot.) ] Signed-off-by: Paul E.
McKenney --- include/linux/rcu_sync.h | 4 +- kernel/rcu/sync.c | 165 +++++++++++++++++++++++++++-------------------- 2 files changed, 96 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 87971e85519c..9b83865d24f9 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -19,7 +19,6 @@ struct rcu_sync { int gp_count; wait_queue_head_t gp_wait; - int cb_state; struct rcu_head cb_head; }; @@ -36,7 +35,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) !rcu_read_lock_bh_held() && !rcu_read_lock_sched_held(), "suspicious rcu_sync_is_idle() usage"); - return !rsp->gp_state; /* GP_IDLE */ + return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } extern void rcu_sync_init(struct rcu_sync *); @@ -49,7 +48,6 @@ extern void rcu_sync_dtor(struct rcu_sync *); .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ - .cb_state = 0, \ } #define DEFINE_RCU_SYNC(name) \ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index ee427e138dad..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,15 +10,13 @@ #include #include -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ void rcu_sync_init(struct rcu_sync *rsp) { @@ -41,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) rsp->gp_state = GP_PASSED; } -/** - * rcu_sync_enter() - Force readers onto slowpath - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * This function is used by updaters who need readers to make use of - * a slowpath during the update. After this function returns, all - * subsequent calls to rcu_sync_is_idle() will return false, which - * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. - * - * When called in isolation, rcu_sync_enter() must wait for a grace - * period, however, closely spaced calls to rcu_sync_enter() can - * optimize away the grace-period wait via a state machine implemented - * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). - */ -void rcu_sync_enter(struct rcu_sync *rsp) -{ - bool need_wait, need_sync; - spin_lock_irq(&rsp->rss_lock); - need_wait = rsp->gp_count++; - need_sync = rsp->gp_state == GP_IDLE; - if (need_sync) - rsp->gp_state = GP_PENDING; - spin_unlock_irq(&rsp->rss_lock); +static void rcu_sync_func(struct rcu_head *rhp); - WARN_ON_ONCE(need_wait && need_sync); - if (need_sync) { - synchronize_rcu(); - rsp->gp_state = GP_PASSED; - wake_up_all(&rsp->gp_wait); - } else if (need_wait) { - wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); - } else { - /* - * Possible when there's a pending CB from a rcu_sync_exit(). - * Nobody has yet been allowed the 'fast' path and thus we can - * avoid doing any sync(). The callback will get 'dropped'. 
- */ - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - } +static void rcu_sync_call(struct rcu_sync *rsp) +{ + call_rcu(&rsp->cb_head, rcu_sync_func); } /** * rcu_sync_func() - Callback function managing reader access to fastpath * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * - * This function is passed to one of the call_rcu() functions by + * This function is passed to call_rcu() function by rcu_sync_enter() and * rcu_sync_exit(), so that it is invoked after a grace period following the - * that invocation of rcu_sync_exit(). It takes action based on events that + * that invocation of enter/exit. + * + * If it is called by rcu_sync_enter() it signals that all the readers were + * switched onto slow path. + * + * If it is called by rcu_sync_exit() it takes action based on events that * have taken place in the meantime, so that closely spaced rcu_sync_enter() * and rcu_sync_exit() pairs need not wait for a grace period. * @@ -107,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - WARN_ON_ONCE(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { /* - * A new rcu_sync_begin() has happened; drop the callback. + * We're at least a GP after the GP_IDLE->GP_ENTER transition. */ - rsp->cb_state = CB_IDLE; - } else if (rsp->cb_state == CB_REPLAY) { + WRITE_ONCE(rsp->gp_state, GP_PASSED); + wake_up_locked(&rsp->gp_wait); + } else if (rsp->gp_state == GP_REPLAY) { /* - * A new rcu_sync_exit() has happened; requeue the callback - * to catch a later GP. + * A new rcu_sync_exit() has happened; requeue the callback to + * catch a later GP. */ - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); } else { /* - * We're at least a GP after rcu_sync_exit(); eveybody will now - * have observed the write side critical section. Let 'em rip!. + * We're at least a GP after the last rcu_sync_exit(); eveybody + * will now have observed the write side critical section. + * Let 'em rip!. */ - rsp->cb_state = CB_IDLE; - rsp->gp_state = GP_IDLE; + WRITE_ONCE(rsp->gp_state, GP_IDLE); } spin_unlock_irqrestore(&rsp->rss_lock, flags); } /** - * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). 
+ */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + int gp_state; + + spin_lock_irq(&rsp->rss_lock); + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. + */ + } + rsp->gp_count++; + spin_unlock_irq(&rsp->rss_lock); + + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ + synchronize_rcu(); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. */ + return; + } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -146,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); + spin_lock_irq(&rsp->rss_lock); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -164,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { + if (gp_state != GP_IDLE) { rcu_barrier(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } -- cgit v1.2.3 From 140e53f20b159722903f0c87358bcd809aa9767e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 10:08:18 -0700 Subject: rcutorture: Add cond_resched() to forward-progress free-up loop The rcu_torture_fwd_prog_cbfree() function frees callbacks used during rcutorture's call_rcu() forward-progress test, but does so in a tight loop. This could cause problems given a very long list of callbacks to be freed, and actual testing produces lists with as many as 25M callbacks. This commit therefore adds a cond_resched() to this loop. While in the area, this commit also rearranges the lock releases to look a bit more sane. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efaa5b3f4d3f..7906ba2d9dad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1674,16 +1674,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) for (;;) { spin_lock_irqsave(&rcu_fwd_lock, flags); rfcp = rcu_fwd_cb_head; - if (!rfcp) + if (!rfcp) { + spin_unlock_irqrestore(&rcu_fwd_lock, flags); break; + } rcu_fwd_cb_head = rfcp->rfc_next; if (!rcu_fwd_cb_head) rcu_fwd_cb_tail = &rcu_fwd_cb_head; spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; + cond_resched(); } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } -- cgit v1.2.3 From e8516c64fe97e27a28fd5bc65b616508ae0020cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 11:06:32 -0700 Subject: rcutorture: Fix stutter_wait() return value and freelist checks The stutter_wait() function is supposed to return true if it actually waits and false otherwise, but it instead unconditionally returns false. This hides a bug in rcu_torture_writer(), which fails to account for the fact that one of the rcu_tortures[] array elements will normally be referenced by rcu_torture_current, and thus not be on the freelist. This commit therefore corrects the stutter_wait() return value and adds a check for rcu_torture_current to rcu_torture_writer()'s check that things get freed after everything goes quiescent. In addition, this commit causes torture_stutter() to give a bit more than one second (instead of only one jiffy) warning of the end of the stutter interval. Finally, this commit disables long-delay readers and aggressive update-side forward-progress checks while forward-progress testing is in flight. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++++---- kernel/torture.c | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7906ba2d9dad..954ac2b98619 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1010,10 +1010,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + if (stutter_wait("rcu_torture_writer") && + !READ_ONCE(rcu_fwd_cb_nodelay)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != + &rcu_tortures[i]) + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1709,6 +1712,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Tight loop containing cond_resched(). */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1747,6 +1752,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } /* Carry out call_rcu() forward-progress testing.
*/ @@ -1816,7 +1823,6 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", @@ -1827,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) n_max_gps, n_max_cbs, cver, gps); rcu_torture_fwd_cb_hist(); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } diff --git a/kernel/torture.c b/kernel/torture.c index 17b2be9bde12..de0e0ecf88e1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -578,10 +578,12 @@ static int stutter; bool stutter_wait(const char *title) { int spt; + bool ret = false; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { + ret = true; if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -592,7 +594,7 @@ bool stutter_wait(const char *title) } torture_shutdown_absorb(title); } - return !!spt; + return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -602,13 +604,20 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + int wtime; + VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - WRITE_ONCE(stutter_pause_test, 1); - schedule_timeout_interruptible(stutter - 1); + wtime = stutter; + if (stutter > HZ + 1) { + WRITE_ONCE(stutter_pause_test, 1); + wtime = stutter - HZ - 1; + schedule_timeout_interruptible(wtime); + wtime = HZ + 1; + } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(wtime); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) -- cgit v1.2.3 From ff3bf92d90d396e51eb78c5ecde11a994ab7a179 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 14:44:49 -0700 Subject: torture: Allow inter-stutter interval to be specified Currently, the inter-stutter interval is the same as the stutter duration, that is, whatever number of jiffies is passed into torture_stutter_init(). This has worked well for quite some time, but the addition of forward-progress testing to rcutorture can delay processes for several seconds, which can triple the time that they are stuttered. This commit therefore adds a second argument to torture_stutter_init() that specifies the inter-stutter interval. While locktorture preserves the current behavior, rcutorture uses the RCU CPU stall warning interval to provide a wider inter-stutter interval. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcutorture.c | 5 ++++- kernel/torture.c | 6 ++++-- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 23d80db426d7..a620118385bb 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -66,7 +66,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)); /* Task stuttering, which forces load/no-load transitions. */ bool stutter_wait(const char *title); -int torture_stutter_init(int s); +int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. 
*/ bool torture_init_begin(char *ttype, int v); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 80a463d31a8d..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -975,7 +975,7 @@ static int __init lock_torture_init(void) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); + firsterr = torture_stutter_init(stutter, stutter); if (firsterr) goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 954ac2b98619..a16d6abe1715 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2373,7 +2373,10 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); + int t; + + t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); if (firsterr) goto unwind; } diff --git a/kernel/torture.c b/kernel/torture.c index de0e0ecf88e1..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void) static struct task_struct *stutter_task; static int stutter_pause_test; static int stutter; +static int stutter_gap; /* * Block until the stutter interval ends. This must be called periodically @@ -621,7 +622,7 @@ static int torture_stutter(void *arg) } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter); + schedule_timeout_interruptible(stutter_gap); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -631,9 +632,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(const int s) +int torture_stutter_init(const int s, const int sgap) { stutter = s; + stutter_gap = sgap; return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); -- cgit v1.2.3 From 5eabea594b4ce9ba0fbd8618bd3bf01aa9f48af7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2019 09:02:46 -0700 Subject: rcutorture: Exempt tasks RCU from timely draining of grace periods After the end of each stutter pause interval, the rcu_torture_writer() kthread checks to be sure that all prior callbacks have completed so that all the test structures have been freed. This works fine except for tasks RCU, in which grace periods can take one good long time. This commit therefore exempts tasks RCU from this check. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a16d6abe1715..6a4558532eac 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,6 +299,7 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; + int slow_gps; const char *name; }; @@ -667,6 +668,7 @@ static struct rcu_torture_ops tasks_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; @@ -1011,7 +1013,8 @@ rcu_torture_writer(void *arg) } rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && - !READ_ONCE(rcu_fwd_cb_nodelay)) + !READ_ONCE(rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit v1.2.3 From ab21f6081f7bc09a0918ef888de795d59a907c1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 Apr 2019 18:30:22 -0700 Subject: rcutorture: Give the scheduler a chance on PREEMPT && NO_HZ_FULL kernels In !PREEMPT kernels, cond_resched() is a no-op. In NO_HZ_FULL kernels, in-kernel execution (such as that of rcutorture's kthreads) might extend indefinitely without the scheduler gaining the aid of a scheduling-clock interrupt. This combination can make the interaction of an rcutorture forward-progress test and a CPU-hotplug stop_machine operation make less forward progress than one might like. Additionally, Sebastian Siewior notes that NO_HZ_FULL kernels have a scheduler check upon return to userspace execution, which suggests that in-kernel emulation of tight userspace loops containing system calls doing call_rcu() might also need explicit checks in the PREEMPT && NO_HZ_FULL case. This commit therefore introduces a rcu_torture_fwd_prog_cond_resched() function that explicitly invokes schedule() in such kernels whenever need_resched() returns true, while retaining use of cond_resched() for kernels that are either !PREEMPT or !NO_HZ_FULL. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6a4558532eac..ef6f6dedf4c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1667,6 +1667,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock_irqrestore(&rcu_fwd_lock, flags); } +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (need_resched()) + schedule(); + } else { + cond_resched(); + } +} + /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. 
@@ -1690,7 +1701,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } return freed; } @@ -1734,7 +1745,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1817,7 +1828,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); -- cgit v1.2.3 From 3432d765c59ba026de49bd4f1f0c2adeff0e7a16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Apr 2019 14:50:05 -0700 Subject: rcutorture: Halt forward-progress checks at end of run Once removed, an rcu_torture element can be deferred-freed by a chain of call_rcu() invocations, with each callback invoking another round of call_rcu() until either a fixed number of call_rcu() invocations have been chained or until the test ends. This means that if the test ends, some of the rcu_torture elements will be "stranded" partway through the deferred-free process, which results in false-positive warnings from rcu_torture_writer() due to lack of forward progress should the test end just at the end of a stutter interval. This commit therefore suppresses rcu_torture_writer()'s forward-progress checks when the test ends in order to avoid these false-positive reports. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ef6f6dedf4c4..a3f5488a319a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1014,7 +1014,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && - !cur_ops->slow_gps) + !cur_ops->slow_gps && + !torture_must_stop()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit v1.2.3 From c682db558e6eec10a711b0a6bcb8c35fd15f6a39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Apr 2019 07:38:27 -0700 Subject: rcutorture: Add trivial RCU implementation I have been showing off a trivial RCU implementation for non-preemptive environments for some time now:

#define rcu_read_lock()
#define rcu_read_unlock()
#define rcu_dereference(p) READ_ONCE(p)
#define rcu_assign_pointer(p, v) smp_store_release(&(p), (v))

void synchronize_rcu(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		sched_setaffinity(current->pid, cpumask_of(cpu));
}

Trivial or not, as the old saying goes, "if it ain't tested, it don't work!". This commit therefore adds a "trivial" flavor to rcutorture and a corresponding TRIVIAL test scenario. This variant does not handle CPU hotplug, which is unconditionally enabled on x86 for post-v5.1-rc3 kernels, which is why the TRIVIAL.boot says "rcutorture.onoff_interval=0".
This commit actually does handle CONFIG_PREEMPT=y kernels, but only because it turns back the Linux-kernel clock in order to provide these alternative definitions (or the moral equivalent thereof):

#define rcu_read_lock() preempt_disable()
#define rcu_read_unlock() preempt_enable()

In CONFIG_PREEMPT=n kernels without debugging, these are equivalent to empty macros give or take a compiler barrier. However, they have been successfully tested with actual empty macros as well. Signed-off-by: Paul E. McKenney [ paulmck: Fix symbol issue reported by kbuild test robot. ] [ paulmck: Work around sched_setaffinity() issue noted by Andrea Parri. ] [ paulmck: Add rcutorture.shuffle_interval=0 to TRIVIAL.boot to fix interaction with shuffler task noted by Peter Zijlstra. ] Tested-by: Andrea Parri --- kernel/rcu/rcu.h | 5 +++ kernel/rcu/rcutorture.c | 45 +++++++++++++++++++++- kernel/rcu/update.c | 13 +++++++ .../selftests/rcutorture/configs/rcu/TRIVIAL | 14 +++++++ .../selftests/rcutorture/configs/rcu/TRIVIAL.boot | 3 ++ 5 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 390aab20115e..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -446,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; @@ -479,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a3f5488a319a..6b803fb2f7ca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -672,6 +672,47 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1789,6 +1830,8 @@ static void rcu_torture_fwd_prog_cr(void) if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call.
*/ /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); @@ -2265,7 +2308,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &busted_srcud_ops, &tasks_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c3bf44ba42e5..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). */ +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + int ret; + + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +#endif + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL new file mode 100644 index 000000000000..4d8eb5bfb6f6 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL @@ -0,0 +1,14 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=8 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot new file mode 100644 index 000000000000..7017f5f5a55f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot @@ -0,0 +1,3 @@ +rcutorture.torture_type=trivial +rcutorture.onoff_interval=0 +rcutorture.shuffle_interval=0 -- cgit v1.2.3 From 34aa34b818407bd475786cf160f7838b7a485e87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 May 2019 16:15:16 -0700 Subject: rcutorture: Dump trace buffer for callback pipe drain failures Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b803fb2f7ca..89be0f492f78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1060,8 +1060,10 @@ rcu_torture_writer(void *arg) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) + &rcu_tortures[i]) { + rcu_ftrace_dump(DUMP_ALL); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + } } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) -- cgit v1.2.3 From 354ea05d0276384045fabbfd62ccd2d985defa9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 May 2019 12:36:53 -0700 Subject: rcutorture: Upper case solves the case of the vanishing NULL pointer Various security techniques can obfuscate pointer printouts on the console. Unfortunately, rcutorture relies on either "null" or all zeroes to identify the last few statistics printouts at the end of the test. 
These need to be identified because failing to do so will result in false-positive complaints about grace-period hangs. This commit therefore prints the "ver:" in capitals ("VER:") when the RCU-protected pointer has been set to NULL, which causes rcutorture's parse-console.sh script to correctly ignore these lines. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 89be0f492f78..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1408,8 +1408,9 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, + rcu_torture_current ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), -- cgit v1.2.3 From 96050c68be33edef18800ad6748f61f81db81a20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Apr 2019 01:40:54 -0700 Subject: rcu: Upgrade sync_exp_work_done() to smp_mb() The sync_exp_work_done() function uses smp_mb__before_atomic(), but there is no obvious atomic in the ensuing code. The ordering is absolutely required for grace periods to work correctly, so this commit upgrades the smp_mb__before_atomic() to smp_mb(). Fixes: 6fba2b3767ea ("rcu: Remove deprecated RCU debugfs tracing code") Reported-by: Andrea Parri Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..d969650a72c6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + smp_mb(); /* Ensure test happens before caller kfree(). */ return true; } return false; -- cgit v1.2.3
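
To make the module-notifier machinery from the srcu_module_coming()/srcu_module_going() patch above concrete, here is a minimal sketch of a module that relies on it. This is illustrative only: the module and symbol names are hypothetical, and the point is simply that a module-scope DEFINE_STATIC_SRCU() needs no explicit init_srcu_struct() or cleanup_srcu_struct() calls, because the notifier runs them at MODULE_STATE_COMING and MODULE_STATE_GOING time, respectively, before module_init() and after module_exit().

#include <linux/module.h>
#include <linux/srcu.h>

/*
 * In a module build, this expands to an uninitialized srcu_struct plus
 * a pointer in the ___srcu_struct_ptrs section, which srcu_module_coming()
 * walks to invoke init_srcu_struct() at load time and srcu_module_going()
 * walks to invoke cleanup_srcu_struct() at unload time.
 */
DEFINE_STATIC_SRCU(my_srcu);

static int __init my_srcu_example_init(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);		/* Reader side. */
	/* ... dereference data protected by my_srcu here ... */
	srcu_read_unlock(&my_srcu, idx);

	synchronize_srcu(&my_srcu);		/* Updater side. */
	return 0;
}
module_init(my_srcu_example_init);
MODULE_LICENSE("GPL");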
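
The rcu_sync changes in the Oleg Nesterov patches above are easiest to see through rcu_sync's one user, percpu-rwsem. The following sketch is simplified from the percpu-rwsem reader and writer paths of this era (lockdep annotations and the writer's rw_sem handling are omitted, so treat it as an approximation rather than the actual implementation):

/* Reader: a bare per-CPU increment whenever no writer is around. */
static inline void example_percpu_down_read(struct percpu_rw_semaphore *sem)
{
	preempt_disable();	/* Supplies the RCU-sched read-side critical section. */
	__this_cpu_inc(*sem->read_count);
	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
		__percpu_down_read(sem, false);	/* Writer active: take the slowpath. */
	preempt_enable();
}

/* Writer: force readers onto their slowpaths for the duration of the update. */
static void example_write_section(struct percpu_rw_semaphore *sem)
{
	rcu_sync_enter(&sem->rss);	/* GP_IDLE -> GP_ENTER, block until GP_PASSED. */
	/* ... all new readers now take the slowpath; do the update ... */
	rcu_sync_exit(&sem->rss);	/* Last exit: GP_PASSED -> GP_EXIT, callback queued. */
}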
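
The GP_REPLAY state in the simplified state machine exists for one specific interleaving, which is worth spelling out. The following comment-only trace assumes two updaters, A and B, sharing one rcu_sync structure, and follows the gp_state transitions implemented in rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func() above:

/*
 * A: rcu_sync_exit()   gp_count 1->0; GP_PASSED -> GP_EXIT; callback queued.
 * B: rcu_sync_enter()  gp_count 0->1; state is not GP_IDLE, and GP_EXIT >=
 *                      GP_PASSED, so B returns without waiting.
 * B: rcu_sync_exit()   gp_count 1->0 before A's callback runs, so
 *                      GP_EXIT -> GP_REPLAY: one more GP must elapse.
 * cb #1:               sees GP_REPLAY; GP_REPLAY -> GP_EXIT, requeues itself.
 * cb #2:               gp_count is still zero, so GP_EXIT -> GP_IDLE and the
 *                      reader fastpath is re-enabled.
 */

Had A's callback instead run between B's enter() and exit(), it would have seen gp_count != 0 and moved the state straight to GP_PASSED.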
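
The stutter_wait() return-value fix above matters because torture kthreads use that value to detect that a no-load interval has just ended. A hedged sketch of the calling idiom (the loop body and the invariant check are hypothetical, modeled on rcu_torture_writer() rather than copied from it):

static int example_torture_kthread(void *arg)
{
	do {
		do_one_torture_operation();	/* Hypothetical per-pass work. */
		if (stutter_wait("example_torture_kthread")) {
			/*
			 * A true return means this kthread really paused,
			 * so the load went quiescent and end-of-stutter
			 * invariants may now be checked.
			 */
			check_quiescent_invariants();	/* Hypothetical. */
		}
	} while (!torture_must_stop());
	torture_kthread_stopping("example_torture_kthread");
	return 0;
}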
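
Finally, the smp_mb() upgrade in the last patch deserves a note: smp_mb__before_atomic() is specified to provide full ordering only when immediately followed by a non-value-returning atomic read-modify-write operation. With no such atomic present, it is merely a compiler barrier on architectures such as x86, whose atomics are already fully ordered. A minimal sketch of the contrast (variable names are illustrative):

static void ordering_contrast(atomic_t *counter, bool *flag)
{
	/* Correct: the barrier attaches to the atomic RMW that follows. */
	smp_mb__before_atomic();
	atomic_inc(counter);		/* Together these order like smp_mb(). */

	/* Incorrect: no atomic RMW follows, so no ordering is guaranteed. */
	smp_mb__before_atomic();
	WRITE_ONCE(*flag, true);	/* Plain store: may be reordered. */

	/* What sync_exp_work_done() now does: an unconditional full barrier. */
	smp_mb();	/* Orders the prior grace-period test before a later kfree(). */
}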