summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-13 19:49:06 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-13 19:49:06 +0300
commitc0b9620bc3f0a0f914996cc6631522d41870a9e0 (patch)
tree8fdf2fed856bebe2c3e60302df3fb64cafa87339 /kernel
parent736676f5c3abd1fc01c41813a95246e892937f6d (diff)
parent64619b283bb35b12a96129e82b40304f7e5551b7 (diff)
downloadlinux-c0b9620bc3f0a0f914996cc6631522d41870a9e0.tar.xz
Merge tag 'rcu.next.v6.10' of https://github.com/urezki/linux
Pull RCU updates from Uladzislau Rezki: - Fix a lockdep complain for lazy-preemptible kernel, remove redundant BH disable for TINY_RCU, remove redundant READ_ONCE() in tree.c, fix false positives KCSAN splat and fix buffer overflow in the print_cpu_stall_info(). - Misc updates related to bpf, tracing and update the MAINTAINERS file. - An improvement of a normal synchronize_rcu() call in terms of latency. It maintains a separate track for sync. users only. This approach bypasses per-cpu nocb-lists thus sync-users do not depend on nocb-list length and how fast regular callbacks are processed. - RCU tasks: switch tasks RCU grace periods to sleep at TASK_IDLE priority, fix some comments, add some diagnostic warning to the exit_tasks_rcu_start() and fix a buffer overflow in the show_rcu_tasks_trace_gp_kthread(). - RCU torture: Increase memory to guest OS, fix a Tasks Rude RCU testing, some updates for TREE09, dump mode information to debug GP kthread state, remove redundant READ_ONCE(), fix some comments about RCU_TORTURE_PIPE_LEN and pipe_count, remove some redundant pointer initialization, fix a hung splat task by when the rcutorture tests start to exit, fix invalid context warning, add '--do-kvfree' parameter to torture test and use slow register unregister callbacks only for rcutype test. * tag 'rcu.next.v6.10' of https://github.com/urezki/linux: (48 commits) rcutorture: Use rcu_gp_slow_register/unregister() only for rcutype test torture: Scale --do-kvfree test time rcutorture: Fix invalid context warning when enable srcu barrier testing rcutorture: Make stall-tasks directly exit when rcutorture tests end rcutorture: Removing redundant function pointer initialization rcutorture: Make rcutorture support print rcu-tasks gp state rcutorture: Use the gp_kthread_dbg operation specified by cur_ops rcutorture: Re-use value stored to ->rtort_pipe_count instead of re-reading rcutorture: Fix rcu_torture_one_read() pipe_count overflow comment rcutorture: Remove extraneous rcu_torture_pipe_update_one() READ_ONCE() rcu: Allocate WQ with WQ_MEM_RECLAIM bit set rcu: Support direct wake-up of synchronize_rcu() users rcu: Add a trace event for synchronize_rcu_normal() rcu: Reduce synchronize_rcu() latency rcu: Fix buffer overflow in print_cpu_stall_info() rcu: Mollify sparse with RCU guard rcu-tasks: Fix show_rcu_tasks_trace_gp_kthread buffer overflow rcu-tasks: Fix the comments for tasks_rcu_exit_srcu_stall_timer rcu-tasks: Replace exit_tasks_rcu_start() initialization with WARN_ON_ONCE() rcu: Remove redundant CONFIG_PROVE_RCU #if condition ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Kconfig2
-rw-r--r--kernel/bpf/trampoline.c2
-rw-r--r--kernel/rcu/Kconfig8
-rw-r--r--kernel/rcu/rcu.h20
-rw-r--r--kernel/rcu/rcutorture.c85
-rw-r--r--kernel/rcu/srcutiny.c31
-rw-r--r--kernel/rcu/srcutree.c5
-rw-r--r--kernel/rcu/sync.c8
-rw-r--r--kernel/rcu/tasks.h44
-rw-r--r--kernel/rcu/tiny.c4
-rw-r--r--kernel/rcu/tree.c430
-rw-r--r--kernel/rcu/tree.h24
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h11
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/ftrace.c3
18 files changed, 580 insertions, 111 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bc25f5098a25..4100df44c665 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -28,7 +28,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index db7599c59c78..88673a4267eb 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
NULL, im->ip_epilogue);
WARN_ON(err);
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
else
percpu_ref_kill(&im->pcref);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e7d2dd267593..3e079de0f5b4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -31,7 +31,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPTION && !SMP
+ default y if !PREEMPT_RCU && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -85,9 +85,13 @@ config FORCE_TASKS_RCU
idle, and user-mode execution as quiescent states. Not for
manual selection in most cases.
-config TASKS_RCU
+config NEED_TASKS_RCU
bool
default n
+
+config TASKS_RCU
+ bool
+ default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 86fce206560e..38238e595a61 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -522,12 +522,18 @@ static inline void show_rcu_tasks_gp_kthreads(void) {}
#ifdef CONFIG_TASKS_RCU
struct task_struct *get_rcu_tasks_gp_kthread(void);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RCU
#ifdef CONFIG_TASKS_RUDE_RCU
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_TRACE_RCU
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif
+
#ifdef CONFIG_TASKS_RCU_GENERIC
void tasks_cblist_init_generic(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -557,8 +563,7 @@ static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq);
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
@@ -566,8 +571,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
unsigned long c);
void rcu_gp_set_torture_wait(int duration);
#else
-static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
- int *flags, unsigned long *gp_seq)
+static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
*flags = 0;
*gp_seq = 0;
@@ -587,20 +591,16 @@ static inline void rcu_gp_set_torture_wait(int duration) { }
#ifdef CONFIG_TINY_SRCU
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = sp->srcu_idx;
}
#elif defined(CONFIG_TREE_SRCU)
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq);
#endif
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45d6b4c3d199..807fbf6123a7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -381,6 +381,9 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ void (*get_gp_data)(int *flags, unsigned long *gp_seq);
+ void (*gp_slow_register)(atomic_t *rgssp);
+ void (*gp_slow_unregister)(atomic_t *rgssp);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -461,12 +464,13 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp)
WRITE_ONCE(rp->rtort_chkp, NULL);
smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
}
- i = READ_ONCE(rp->rtort_pipe_count);
+ i = rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(rp->rtort_pipe_count, i + 1);
- if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
+ if (i + 1 >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -564,10 +568,12 @@ static struct rcu_torture_ops rcu_ops = {
.call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
- .stats = NULL,
.gp_kthread_dbg = show_rcu_gp_kthreads,
.check_boost_failed = rcu_check_boost_fail,
.stall_dur = rcu_jiffies_till_stall_check,
+ .get_gp_data = rcutorture_get_gp_data,
+ .gp_slow_register = rcu_gp_slow_register,
+ .gp_slow_unregister = rcu_gp_slow_unregister,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -611,9 +617,6 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
- .cb_barrier = NULL,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "busted"
};
@@ -627,6 +630,11 @@ static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
+static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
+}
+
static int srcu_torture_read_lock(void)
{
if (cur_ops == &srcud_ops)
@@ -735,6 +743,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
@@ -773,6 +782,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
@@ -837,8 +847,6 @@ static struct rcu_torture_ops trivial_ops = {
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "trivial"
};
@@ -881,8 +889,7 @@ static struct rcu_torture_ops tasks_ops = {
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
.gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
- .fqs = NULL,
- .stats = NULL,
+ .get_gp_data = rcu_tasks_get_gp_data,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks"
@@ -921,9 +928,8 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .get_gp_data = rcu_tasks_rude_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
@@ -973,9 +979,8 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .get_gp_data = rcu_tasks_trace_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks-tracing"
@@ -1399,6 +1404,7 @@ rcu_torture_writer(void *arg)
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
rcu_torture_writer_state = RTWS_DELAY;
udelay(torture_random(&rand) & 0x3ff);
rcu_torture_writer_state = RTWS_REPLACE;
@@ -1414,6 +1420,7 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+ ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count);
// Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
@@ -1586,7 +1593,8 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1997,7 +2005,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
+ // Should not happen in a correct RCU implementation,
+ // happens quite often for torture_type=busted.
pipe_count = RCU_TORTURE_PIPE_LEN;
}
completed = cur_ops->get_gp_seq();
@@ -2259,10 +2268,8 @@ rcu_torture_stats_print(void)
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
- rcutorture_get_gp_data(cur_ops->ttype,
- &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
- &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
@@ -2486,8 +2493,8 @@ static int rcu_torture_stall(void *args)
preempt_disable();
pr_alert("%s start on CPU %d.\n",
__func__, raw_smp_processor_id());
- while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
- stop_at))
+ while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
+ !kthread_should_stop())
if (stall_cpu_block) {
#ifdef CONFIG_PREEMPTION
preempt_schedule();
@@ -2832,13 +2839,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
!shutdown_time_arrived()) {
- WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
- pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+ if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n",
__func__,
stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
- n_max_gps, n_max_cbs, cver, gps);
+ n_max_gps, n_max_cbs, cver, gps, num_online_cpus());
atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
@@ -3040,11 +3048,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
}
/* IPI handler to get callback posted on desired CPU, if online. */
-static void rcu_torture_barrier1cb(void *rcu_void)
+static int rcu_torture_barrier1cb(void *rcu_void)
{
struct rcu_head *rhp = rcu_void;
cur_ops->call(rhp, rcu_torture_barrier_cbf);
+ return 0;
}
/* kthread function to register callbacks used to test RCU barriers. */
@@ -3070,11 +3079,9 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
- if (smp_call_function_single(myid, rcu_torture_barrier1cb,
- &rcu, 1)) {
- // IPI failed, so use direct call from current CPU.
+ if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1))
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
- }
+
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -3340,12 +3347,12 @@ rcu_torture_cleanup(void)
pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
}
- rcu_gp_slow_unregister(NULL);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
- rcu_gp_slow_unregister(NULL);
return;
}
@@ -3384,8 +3391,8 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n",
cur_ops->name, (long)gp_seq, flags,
rcutorture_seq_diff(gp_seq, start_gp_seq));
@@ -3444,7 +3451,8 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
- rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -3756,8 +3764,8 @@ rcu_torture_init(void)
nrealreaders = 1;
}
rcu_torture_print_module_parms(cur_ops, "Start of test");
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
start_gp_seq = gp_seq;
pr_alert("%s: Start-test grace-period state: g%ld f%#x\n",
cur_ops->name, (long)gp_seq, flags);
@@ -3926,7 +3934,8 @@ rcu_torture_init(void)
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
- rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
+ if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
+ cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d6..5afd5cf494db 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+ int newval;
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ preempt_enable();
if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
@@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
return; /* Already running or nothing to do. */
+ preempt_enable();
+ }
/* Remove recently arrived callbacks and wait for readers. */
WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp)
idx = (ssp->srcu_idx & 0x2) / 2;
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ preempt_enable();
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ preempt_disable(); // Needed for PREEMPT_AUTO
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ preempt_enable();
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -150,8 +159,11 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
+ preempt_disable(); // Needed for PREEMPT_AUTO
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+ preempt_enable();
+ if (idx)
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
+ preempt_disable(); // Needed for PREEMPT_AUTO
cookie = get_state_synchronize_srcu(ssp);
- if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+ preempt_enable();
return;
+ }
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
@@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
+ preempt_enable();
}
/*
@@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
+ preempt_disable(); // Needed for PREEMPT_AUTO
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
*/
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
- unsigned long ret = get_state_synchronize_srcu(ssp);
+ unsigned long ret;
+ preempt_disable(); // Needed for PREEMPT_AUTO
+ ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e4d673fc30f4..bc4b58b0204e 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1826,12 +1826,9 @@ static void process_srcu(struct work_struct *work)
srcu_reschedule(ssp, curdelay);
}
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *ssp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 86df878a2fee..6c2bd9001adc 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* we are called at early boot time but this shouldn't happen.
*/
}
- rsp->gp_count++;
+ WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
spin_unlock_irq(&rsp->rss_lock);
if (gp_state == GP_IDLE) {
@@ -151,11 +151,15 @@ void rcu_sync_enter(struct rcu_sync *rsp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
+ int gpc;
+
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
- if (!--rsp->gp_count) {
+ gpc = rsp->gp_count - 1;
+ WRITE_ONCE(rsp->gp_count, gpc);
+ if (!gpc) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 147b5945d67a..e1bf33018e6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -74,6 +74,7 @@ struct rcu_tasks_percpu {
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
* @rtpcpu: This flavor's rcu_tasks_percpu structure.
* @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
* @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
@@ -107,6 +108,7 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ unsigned int wait_state;
struct rcu_tasks_percpu __percpu *rtpcpu;
int percpu_enqueue_shift;
int percpu_enqueue_lim;
@@ -134,6 +136,7 @@ static struct rcu_tasks rt_name = \
.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
.gp_func = gp, \
.call_func = call, \
+ .wait_state = TASK_UNINTERRUPTIBLE, \
.rtpcpu = &rt_name ## __percpu, \
.lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
@@ -147,7 +150,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
-/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
#endif
@@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
- wait_rcu_gp(rtp->call_func);
+ wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
return;
}
rcu_tasks_one_gp(rtp, true);
@@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void)
rcu_tasks.postscan_func = rcu_tasks_postscan;
rcu_tasks.holdouts_func = check_all_holdout_tasks;
rcu_tasks.postgp_func = rcu_tasks_postgp;
+ rcu_tasks.wait_state = TASK_IDLE;
rcu_spawn_tasks_kthread_generic(&rcu_tasks);
return 0;
}
@@ -1178,6 +1182,13 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data);
+
/*
* Protect against tasklist scan blind spot while the task is exiting and
* may be removed from the tasklist. Do this by adding the task to yet
@@ -1199,8 +1210,7 @@ void exit_tasks_rcu_start(void)
rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
t->rcu_tasks_exit_cpu = smp_processor_id();
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- if (!rtpcp->rtp_exit_list.next)
- INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ WARN_ON_ONCE(!rtpcp->rtp_exit_list.next);
list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
preempt_enable();
@@ -1358,6 +1368,13 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1457,6 +1474,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v)
/*
* Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
* the four-byte operand-size restriction of some platforms.
+ *
* Returns the old value, which is often ignored.
*/
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
@@ -1468,7 +1486,14 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
if (trs_old.b.need_qs != old)
return trs_old.b.need_qs;
trs_new.b.need_qs = new;
- ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+
+ // Although cmpxchg() appears to KCSAN to update all four bytes,
+ // only the .b.need_qs byte actually changes.
+ instrument_atomic_read_write(&t->trc_reader_special.b.need_qs,
+ sizeof(t->trc_reader_special.b.need_qs));
+ // Avoid false-positive KCSAN failures.
+ ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s));
+
return ret.b.need_qs;
}
EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
@@ -1994,7 +2019,7 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
@@ -2010,6 +2035,13 @@ struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 705c0d16850a..4402d6f5f857 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -130,9 +130,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- local_bh_disable();
rcu_reclaim_tiny(list);
- local_bh_enable();
list = next;
}
}
@@ -155,7 +153,9 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ preempt_disable();
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9642dd06c25..28c7031711a3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,6 +75,7 @@
#define MODULE_PARAM_PREFIX "rcutree."
/* Data structures. */
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
@@ -93,6 +94,8 @@ static struct rcu_state rcu_state = {
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
+ .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
+ rcu_sr_normal_gp_cleanup_work),
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -240,8 +243,36 @@ static long rcu_get_n_cbs_cpu(int cpu)
return 0;
}
+/**
+ * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
+ *
+ * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
+ * This is a special-purpose function to be used in the softirq
+ * infrastructure and perhaps the occasional long-running softirq
+ * handler.
+ *
+ * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
+ * equivalent to momentarily completely enabling preemption. For
+ * example, given this code::
+ *
+ * local_bh_disable();
+ * do_something();
+ * rcu_softirq_qs(); // A
+ * do_something_else();
+ * local_bh_enable(); // B
+ *
+ * A call to synchronize_rcu() that began concurrently with the
+ * call to do_something() would be guaranteed to wait only until
+ * execution reached statement A. Without that rcu_softirq_qs(),
+ * that same synchronize_rcu() would instead be guaranteed to wait
+ * until execution reached statement B.
+ */
void rcu_softirq_qs(void)
{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal rcu_softirq_qs() in RCU read-side critical section");
rcu_qs();
rcu_preempt_deferred_qs(current);
rcu_tasks_qs(current, false);
@@ -508,17 +539,10 @@ static struct rcu_node *rcu_get_root(void)
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq)
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
- switch (test_type) {
- case RCU_FLAVOR:
- *flags = READ_ONCE(rcu_state.gp_flags);
- *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
- break;
- default:
- break;
- }
+ *flags = READ_ONCE(rcu_state.gp_flags);
+ *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
@@ -813,8 +837,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
return 1; /* Break things loose after complaining. */
}
@@ -1423,6 +1447,305 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
}
/*
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes, which need to
+ * wait for the current GP to complete.
+ * done tail: Tracks the set of nodes, for which grace
+ * period has elapsed. These nodes processing
+ * will be done as part of the cleanup work
+ * execution by a kworker.
+ *
+ * At every grace period init, a new wait node is added
+ * to the llist. This wait node is used as wait tail
+ * for this new grace period. Given that there are a fixed
+ * number of wait nodes, if all wait nodes are in use
+ * (which can happen when kworker callback processing
+ * is delayed) and additional grace period is requested.
+ * This means, a system is slow in processing callbacks.
+ *
+ * TODO: If a slow processing is detected, a first node
+ * in the llist should be used as a wait-tail for this
+ * grace period, therefore users which should wait due
+ * to a slow process are handled by _this_ grace period
+ * and not next.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ * +----------+ +--------+ +-------+
+ * | | | | | |
+ * | head |---------> | cb2 |--------->| cb1 |
+ * | | | | | |
+ * +----------+ +--------+ +-------+
+ *
+ *
+ *
+ * b. New GP1 Start:
+ *
+ * WAIT TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ * WAIT TAIL DONE TAIL
+ * | |
+ * | |
+ * v v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e). So, done tail updates and reads need
+ * to use the rel-acq semantics. If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail. If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantees that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+
+ * | | | |
+ * | head ------> wait |
+ * | | | head2 |
+ * +----------+ +--------+
+ *
+ */
+static bool rcu_sr_is_wait_head(struct llist_node *node)
+{
+ return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+ node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+ struct sr_wait_node *sr_wn;
+ int i;
+
+ for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+ sr_wn = &(rcu_state.srs_wait_nodes)[i];
+
+ if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+ return &sr_wn->node;
+ }
+
+ return NULL;
+}
+
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+ struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
+
+ atomic_set_release(&sr_wn->inuse, 0);
+}
+
+/* Disabled by default. */
+static int rcu_normal_wake_from_gp;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
+
+static void rcu_sr_normal_complete(struct llist_node *node)
+{
+ struct rcu_synchronize *rs = container_of(
+ (struct rcu_head *) node, struct rcu_synchronize, head);
+ unsigned long oldstate = (unsigned long) rs->head.func;
+
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ !poll_state_synchronize_rcu(oldstate),
+ "A full grace period is not passed yet: %lu",
+ rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+
+ /* Finally. */
+ complete(&rs->completion);
+}
+
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
+{
+ struct llist_node *done, *rcu, *next, *head;
+
+ /*
+ * This work execution can potentially execute
+ * while a new done tail is being updated by
+ * grace period kthread in rcu_sr_normal_gp_cleanup().
+ * So, read and updates of done tail need to
+ * follow acq-rel semantics.
+ *
+ * Given that wq semantics guarantees that a single work
+ * cannot execute concurrently by multiple kworkers,
+ * the done tail list manipulations are protected here.
+ */
+ done = smp_load_acquire(&rcu_state.srs_done_tail);
+ if (!done)
+ return;
+
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+ head = done->next;
+ done->next = NULL;
+
+ /*
+ * The dummy node, which is pointed to by the
+ * done tail which is acq-read above is not removed
+ * here. This allows lockless additions of new
+ * rcu_synchronize nodes in rcu_sr_normal_add_req(),
+ * while the cleanup work executes. The dummy
+ * nodes is removed, in next round of cleanup
+ * work execution.
+ */
+ llist_for_each_safe(rcu, next, head) {
+ if (!rcu_sr_is_wait_head(rcu)) {
+ rcu_sr_normal_complete(rcu);
+ continue;
+ }
+
+ rcu_sr_put_wait_head(rcu);
+ }
+}
+
+/*
+ * Helper function for rcu_gp_cleanup().
+ */
+static void rcu_sr_normal_gp_cleanup(void)
+{
+ struct llist_node *wait_tail, *next, *rcu;
+ int done = 0;
+
+ wait_tail = rcu_state.srs_wait_tail;
+ if (wait_tail == NULL)
+ return;
+
+ rcu_state.srs_wait_tail = NULL;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
+
+ /*
+ * Process (a) and (d) cases. See an illustration.
+ */
+ llist_for_each_safe(rcu, next, wait_tail->next) {
+ if (rcu_sr_is_wait_head(rcu))
+ break;
+
+ rcu_sr_normal_complete(rcu);
+ // It can be last, update a next on this step.
+ wait_tail->next = next;
+
+ if (++done == SR_MAX_USERS_WAKE_FROM_GP)
+ break;
+ }
+
+ // concurrent sr_normal_gp_cleanup work might observe this update.
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+
+ /*
+ * We schedule a work in order to perform a final processing
+ * of outstanding users(if still left) and releasing wait-heads
+ * added by rcu_sr_normal_gp_init() call.
+ */
+ queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+}
+
+/*
+ * Helper function for rcu_gp_init().
+ */
+static bool rcu_sr_normal_gp_init(void)
+{
+ struct llist_node *first;
+ struct llist_node *wait_head;
+ bool start_new_poll = false;
+
+ first = READ_ONCE(rcu_state.srs_next.first);
+ if (!first || rcu_sr_is_wait_head(first))
+ return start_new_poll;
+
+ wait_head = rcu_sr_get_wait_head();
+ if (!wait_head) {
+ // Kick another GP to retry.
+ start_new_poll = true;
+ return start_new_poll;
+ }
+
+ /* Inject a wait-dummy-node. */
+ llist_add(wait_head, &rcu_state.srs_next);
+
+ /*
+ * A waiting list of rcu_synchronize nodes should be empty on
+ * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(),
+ * rolls it over. If not, it is a BUG, warn a user.
+ */
+ WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+ rcu_state.srs_wait_tail = wait_head;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+
+ return start_new_poll;
+}
+
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
+{
+ llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
+}
+
+/*
* Initialize a new grace period. Return false if no grace period required.
*/
static noinline_for_stack bool rcu_gp_init(void)
@@ -1432,10 +1755,11 @@ static noinline_for_stack bool rcu_gp_init(void)
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
+ bool start_new_poll;
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
- if (!READ_ONCE(rcu_state.gp_flags)) {
+ if (!rcu_state.gp_flags) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq_rcu_node(rnp);
return false;
@@ -1456,11 +1780,25 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
+ start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
+ * The "start_new_poll" is set to true, only when this GP is not able
+ * to handle anything and there are outstanding users. It happens when
+ * the rcu_sr_normal_gp_init() function was not able to insert a dummy
+ * separator to the llist, because there were no left any dummy-nodes.
+ *
+ * Number of dummy-nodes is fixed, it could be that we are run out of
+ * them, if so we start a new pool request to repeat a try. It is rare
+ * and it means that a system is doing a slow processing of callbacks.
+ */
+ if (start_new_poll)
+ (void) start_poll_synchronize_rcu();
+
+ /*
* Apply per-leaf buffered online and offline operations to
* the rcu_node tree. Note that this new grace period need not
* wait for subsequent online CPUs, and that RCU hooks in the CPU
@@ -1620,8 +1958,7 @@ static void rcu_gp_fqs(bool first_time)
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq_rcu_node(rnp);
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq_rcu_node(rnp);
}
}
@@ -1825,6 +2162,9 @@ static noinline void rcu_gp_cleanup(void)
}
raw_spin_unlock_irq_rcu_node(rnp);
+ // Make synchronize_rcu() users aware of the end of old grace period.
+ rcu_sr_normal_gp_cleanup();
+
// If strict, make all CPUs aware of the end of the old grace period.
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
@@ -1882,8 +2222,7 @@ static void rcu_report_qs_rsp(unsigned long flags)
{
raw_lockdep_assert_held_rcu_node(rcu_get_root());
WARN_ON_ONCE(!rcu_gp_in_progress());
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
rcu_gp_kthread_wake();
}
@@ -2398,8 +2737,7 @@ void rcu_force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
@@ -3559,6 +3897,43 @@ static int rcu_blocking_is_gp(void)
return true;
}
+/*
+ * Helper function for the synchronize_rcu() API.
+ */
+static void synchronize_rcu_normal(void)
+{
+ struct rcu_synchronize rs;
+
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
+
+ if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ wait_rcu_gp(call_rcu_hurry);
+ goto trace_complete_out;
+ }
+
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+
+ /*
+ * This code might be preempted, therefore take a GP
+ * snapshot before adding a request.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ rs.head.func = (void *) get_state_synchronize_rcu();
+
+ rcu_sr_normal_add_req(&rs);
+
+ /* Kick a GP and start waiting. */
+ (void) start_poll_synchronize_rcu();
+
+ /* Now we can wait. */
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+
+trace_complete_out:
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
+}
+
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -3610,7 +3985,7 @@ void synchronize_rcu(void)
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
@@ -4303,7 +4678,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
// whether spinlocks may be acquired safely.
static bool rcu_init_invoked(void)
{
- return !!rcu_state.n_online_cpus;
+ return !!READ_ONCE(rcu_state.n_online_cpus);
}
/*
@@ -4395,9 +4770,9 @@ rcu_boot_init_percpu_data(int cpu)
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -4513,6 +4888,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
@@ -4656,7 +5032,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
@@ -4707,7 +5083,7 @@ void rcutree_report_cpu_dead(void)
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
rcu_disable_urgency_upon_qs(rdp);
@@ -4781,6 +5157,7 @@ void rcutree_migrate_callbacks(int cpu)
*/
int rcutree_dead_cpu(unsigned int cpu)
{
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -5229,6 +5606,9 @@ void __init rcu_init(void)
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!sync_wq);
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df48160b3136..bae7925c497f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -273,9 +273,9 @@ struct rcu_data {
bool rcu_iw_pending; /* Is ->rcu_iw pending? */
unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */
unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */
- short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */
+ short rcu_ofl_gp_state; /* ->gp_state at last offline. */
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
- short rcu_onl_gp_flags; /* ->gp_flags at last online. */
+ short rcu_onl_gp_state; /* ->gp_state at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
@@ -316,6 +316,19 @@ do { \
} while (0)
/*
+ * A max threshold for synchronize_rcu() users which are
+ * awaken directly by the rcu_gp_kthread(). Left part is
+ * deferred to the main worker.
+ */
+#define SR_MAX_USERS_WAKE_FROM_GP 5
+#define SR_NORMAL_GP_WAIT_HEAD_MAX 5
+
+struct sr_wait_node {
+ atomic_t inuse;
+ struct llist_node node;
+};
+
+/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -400,6 +413,13 @@ struct rcu_state {
/* Synchronize offline with */
/* GP pre-initialization. */
int nocb_is_setup; /* nocb is setup from boot */
+
+ /* synchronize_rcu() part. */
+ struct llist_head srs_next; /* request a GP users. */
+ struct llist_node *srs_wait_tail; /* wait for GP users. */
+ struct llist_node *srs_done_tail; /* ready for GP users. */
+ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
+ struct work_struct srs_cleanup_work;
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6b83537480b1..8a1d9c8bd9f7 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -930,7 +930,7 @@ void synchronize_rcu_expedited(void)
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36a8b5dbf5b5..340bbefe5f65 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -805,8 +805,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
rdp = per_cpu_ptr(&rcu_data, cpu);
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
}
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5d666428546b..460efecd077b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -504,7 +504,8 @@ static void print_cpu_stall_info(int cpu)
rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
- sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ // Print signed value, as negative values indicate a probable bug.
+ snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j);
pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
@@ -579,7 +580,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
rcu_state.name, (jiffies - jiffies_fqs),
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
+ data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
@@ -628,7 +629,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -689,7 +691,8 @@ static void print_cpu_stall(unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 46aaaa9fe339..f8436969e0c8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
@@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
if (crcu_array[j] == crcu_array[i])
break;
if (j == i) {
- wait_for_completion(&rs_array[i].completion);
+ wait_for_completion_state(&rs_array[i].completion, state);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 47345bf1d4a9..b3d7f62ac581 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -163,7 +163,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
config GENERIC_TRACER
bool
@@ -204,7 +204,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..6c96b30f3d63 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3157,8 +3157,7 @@ out:
* synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- synchronize_rcu_tasks();
+ synchronize_rcu_tasks();
ftrace_trampoline_free(ops);
}