summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/bpf/btf.c5
-rw-r--r--kernel/bpf/core.c8
-rw-r--r--kernel/bpf/helpers.c12
-rw-r--r--kernel/bpf/verifier.c113
-rw-r--r--kernel/cfi.c22
-rw-r--r--kernel/cgroup/cgroup-internal.h1
-rw-r--r--kernel/cgroup/cgroup-v1.c17
-rw-r--r--kernel/cgroup/cgroup.c173
-rw-r--r--kernel/cgroup/rstat.c44
-rw-r--r--kernel/configs/x86_debug.config3
-rw-r--r--kernel/context_tracking.c617
-rw-r--r--kernel/cpu_pm.c8
-rw-r--r--kernel/dma/direct.c5
-rw-r--r--kernel/entry/common.c16
-rw-r--r--kernel/events/core.c66
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/extable.c4
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/groups.c13
-rw-r--r--kernel/hung_task.c11
-rw-r--r--kernel/irq/Kconfig2
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/generic-chip.c2
-rw-r--r--kernel/irq/ipi.c16
-rw-r--r--kernel/irq/irqdesc.c2
-rw-r--r--kernel/irq/irqdomain.c14
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/jump_label.c41
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kexec_core.c27
-rw-r--r--kernel/kexec_file.c94
-rw-r--r--kernel/kthread.c14
-rw-r--r--kernel/locking/lockdep.c11
-rw-r--r--kernel/locking/rwsem.c30
-rw-r--r--kernel/module/internal.h13
-rw-r--r--kernel/module/kallsyms.c35
-rw-r--r--kernel/module/main.c22
-rw-r--r--kernel/nsproxy.c3
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/power/energy_model.c24
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/power/swap.c29
-rw-r--r--kernel/power/user.c13
-rw-r--r--kernel/printk/printk.c600
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Kconfig31
-rw-r--r--kernel/rcu/Kconfig.debug2
-rw-r--r--kernel/rcu/rcu.h19
-rw-r--r--kernel/rcu/rcuscale.c1
-rw-r--r--kernel/rcu/rcutorture.c247
-rw-r--r--kernel/rcu/refscale.c18
-rw-r--r--kernel/rcu/srcutree.c98
-rw-r--r--kernel/rcu/tasks.h541
-rw-r--r--kernel/rcu/tiny.c25
-rw-r--r--kernel/rcu/tree.c660
-rw-r--r--kernel/rcu/tree.h21
-rw-r--r--kernel/rcu/tree_exp.h115
-rw-r--r--kernel/rcu/tree_nocb.h266
-rw-r--r--kernel/rcu/tree_plugin.h82
-rw-r--r--kernel/rcu/tree_stall.h57
-rw-r--r--kernel/rcu/update.c15
-rw-r--r--kernel/reboot.c14
-rw-r--r--kernel/rseq.c23
-rw-r--r--kernel/sched/core.c287
-rw-r--r--kernel/sched/core_sched.c15
-rw-r--r--kernel/sched/cpufreq_schedutil.c5
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/deadline.c11
-rw-r--r--kernel/sched/fair.c818
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/idle.c10
-rw-r--r--kernel/sched/pelt.h40
-rw-r--r--kernel/sched/psi.c19
-rw-r--r--kernel/sched/rt.c15
-rw-r--r--kernel/sched/sched.h69
-rw-r--r--kernel/sched/topology.c23
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/smp.c4
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sysctl.c57
-rw-r--r--kernel/time/Kconfig37
-rw-r--r--kernel/time/posix-timers.c19
-rw-r--r--kernel/time/tick-sched.c3
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--kernel/trace/blktrace.c75
-rw-r--r--kernel/trace/bpf_trace.c60
-rw-r--r--kernel/trace/ftrace.c13
-rw-r--r--kernel/trace/rethook.c9
-rw-r--r--kernel/trace/trace.c21
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_kprobe.c11
-rw-r--r--kernel/trace/trace_uprobe.c1
-rw-r--r--kernel/watch_queue.c105
-rw-r--r--kernel/watchdog.c4
-rw-r--r--kernel/watchdog_hld.c4
-rw-r--r--kernel/workqueue.c5
103 files changed, 3558 insertions, 2675 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 7690c29d4ee4..a75978ae38ad 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1100,7 +1100,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
@@ -1390,7 +1390,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
str);
} else {
audit_log_format(ab, " data=");
- if (data_len > 0 && str[data_len - 1] == '\0')
+ if (str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3a2abd6d1a1..3a8c9d744800 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1014,10 +1014,10 @@ static void audit_reset_context(struct audit_context *ctx)
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
- ctx->type = 0;
audit_free_module(ctx);
ctx->fds[0] = -1;
audit_proctitle_free(ctx);
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63d0ac7dfe2f..eb12d4f705cc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
n = btf_nr_types(btf);
for (i = start_id; i < n; i++) {
const struct btf_type *t;
+ int chain_limit = 32;
u32 cur_id = i;
t = btf_type_by_id(btf, i);
@@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
in_tags = btf_type_is_type_tag(t);
while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
if (btf_type_is_type_tag(t)) {
if (!in_tags) {
btf_verifier_log(env, "Type tags don't precede modifiers");
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5f6f3f829b36..e7961508a47d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 225806a02efb..bb1254f07667 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
-BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
{
int err;
- if (!src->data)
+ if (!src->data || flags)
return -EINVAL;
err = bpf_dynptr_check_off_len(src, offset, len);
@@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_DYNPTR,
.arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
{
int err;
- if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aedac2ac02b9..0efbac0fd126 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1562,6 +1562,21 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static void reg_bounds_sync(struct bpf_reg_state *reg)
+{
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(reg);
+}
+
static bool __reg32_bound_s64(s32 a)
{
return a >= 0 && a <= S32_MAX;
@@ -1603,16 +1618,8 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
* so they do not impact tnum bounds calculation.
*/
__mark_reg64_unbounded(reg);
- __update_reg_bounds(reg);
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
static bool __reg64_bound_s32(s64 a)
@@ -1628,7 +1635,6 @@ static bool __reg64_bound_u32(u64 a)
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
{
__mark_reg32_unbounded(reg);
-
if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
reg->s32_min_value = (s32)reg->smin_value;
reg->s32_max_value = (s32)reg->smax_value;
@@ -1637,14 +1643,7 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
reg->u32_min_value = (u32)reg->umin_value;
reg->u32_max_value = (u32)reg->umax_value;
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -6943,9 +6942,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
ret_reg->s32_max_value = meta->msize_max_value;
ret_reg->smin_value = -MAX_ERRNO;
ret_reg->s32_min_value = -MAX_ERRNO;
- __reg_deduce_bounds(ret_reg);
- __reg_bound_offset(ret_reg);
- __update_reg_bounds(ret_reg);
+ reg_bounds_sync(ret_reg);
}
static int
@@ -8202,11 +8199,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
-
+ reg_bounds_sync(dst_reg);
if (sanitize_check_bounds(env, insn, dst_reg) < 0)
return -EACCES;
if (sanitize_needed(opcode)) {
@@ -8944,10 +8937,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
return 0;
}
@@ -9136,10 +9126,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
@@ -9577,26 +9564,33 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
return;
switch (opcode) {
+ /* JEQ/JNE comparison doesn't change the register equivalence.
+ *
+ * r1 = r2;
+ * if (r1 == 42) goto label;
+ * ...
+ * label: // here both r1 and r2 are known to be 42.
+ *
+ * Hence when marking register as known preserve it's ID.
+ */
case BPF_JEQ:
+ if (is_jmp32) {
+ __mark_reg32_known(true_reg, val32);
+ true_32off = tnum_subreg(true_reg->var_off);
+ } else {
+ ___mark_reg_known(true_reg, val);
+ true_64off = true_reg->var_off;
+ }
+ break;
case BPF_JNE:
- {
- struct bpf_reg_state *reg =
- opcode == BPF_JEQ ? true_reg : false_reg;
-
- /* JEQ/JNE comparison doesn't change the register equivalence.
- * r1 = r2;
- * if (r1 == 42) goto label;
- * ...
- * label: // here both r1 and r2 are known to be 42.
- *
- * Hence when marking register as known preserve it's ID.
- */
- if (is_jmp32)
- __mark_reg32_known(reg, val32);
- else
- ___mark_reg_known(reg, val);
+ if (is_jmp32) {
+ __mark_reg32_known(false_reg, val32);
+ false_32off = tnum_subreg(false_reg->var_off);
+ } else {
+ ___mark_reg_known(false_reg, val);
+ false_64off = false_reg->var_off;
+ }
break;
- }
case BPF_JSET:
if (is_jmp32) {
false_32off = tnum_and(false_32off, tnum_const(~val32));
@@ -9735,21 +9729,8 @@ static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
dst_reg->smax_value);
src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
dst_reg->var_off);
- /* We might have learned new bounds from the var_off. */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
- /* We might have learned something about the sign bit. */
- __reg_deduce_bounds(src_reg);
- __reg_deduce_bounds(dst_reg);
- /* We might have learned some bits from the bounds. */
- __reg_bound_offset(src_reg);
- __reg_bound_offset(dst_reg);
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
+ reg_bounds_sync(src_reg);
+ reg_bounds_sync(dst_reg);
}
static void reg_combine_min_max(struct bpf_reg_state *true_src,
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 9594cfd1cf2c..2046276ee234 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
static inline cfi_check_fn find_check_fn(unsigned long ptr)
{
cfi_check_fn fn = NULL;
+ unsigned long flags;
+ bool rcu_idle;
if (is_kernel_text(ptr))
return __cfi_check;
@@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr)
* the shadow and __module_address use RCU, so we need to wake it
* up if necessary.
*/
- RCU_NONIDLE({
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
+ rcu_idle = !rcu_is_watching();
+ if (rcu_idle) {
+ local_irq_save(flags);
+ ct_irq_enter();
+ }
+
+ if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
+ fn = find_shadow_check_fn(ptr);
+ if (!fn)
+ fn = find_module_check_fn(ptr);
- if (!fn)
- fn = find_module_check_fn(ptr);
- });
+ if (rcu_idle) {
+ ct_irq_exit();
+ local_irq_restore(flags);
+ }
return fn;
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5da09c74228d..36b740cb3d59 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -233,6 +233,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index afc6c0e9c966..2ade21b54dc4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -875,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -898,6 +900,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -909,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -960,6 +966,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
@@ -1211,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..ffaccd6373f1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@@ -765,7 +763,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -1240,7 +1239,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1307,6 +1307,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1367,6 +1381,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
@@ -1376,6 +1391,31 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res_cgroup = NULL;
+
+ if (cset == &init_css_set) {
+ res_cgroup = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res_cgroup = cset->dfl_cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == root) {
+ res_cgroup = c;
+ break;
+ }
+ }
+ }
+
+ return res_cgroup;
+}
+
/*
* look up cgroup associated with current task's cgroup namespace on the
* specified hierarchy
@@ -1391,22 +1431,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ res = __cset_cgroup_from_root(cset, root);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
rcu_read_unlock();
BUG_ON(!res);
@@ -1422,22 +1448,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
-
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ res = __cset_cgroup_from_root(cset, root);
BUG_ON(!res);
return res;
@@ -1864,6 +1875,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
@@ -1871,6 +1883,7 @@ enum cgroup2_param {
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
@@ -1890,6 +1903,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
@@ -1908,6 +1924,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@@ -1924,6 +1943,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
@@ -1974,7 +1995,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -2196,6 +2218,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
return 0;
}
@@ -2570,10 +2596,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2597,21 +2619,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2651,7 +2679,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2664,7 +2692,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2689,7 +2717,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2708,7 +2736,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2716,8 +2744,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2941,29 +2969,48 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ if (has_tasks)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2975,7 +3022,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ if (has_tasks)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -3609,21 +3657,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
@@ -3649,7 +3697,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@@ -5842,12 +5890,6 @@ int __init cgroup_init(void)
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -6759,6 +6801,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
+ "favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n");
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 24b5c2ab5598..feb59380c896 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -310,6 +310,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,6 +321,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -398,6 +404,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
@@ -411,8 +422,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
cputime->stime = 0;
@@ -438,6 +450,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -445,27 +461,43 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
}
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index dcd86f32f4ed..6fac5b405334 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -7,12 +7,11 @@ CONFIG_DEBUG_SLAB=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_PAGEALLOC=y
CONFIG_SLUB_DEBUG_ON=y
-CONFIG_KMEMCHECK=y
CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
CONFIG_GCOV_KERNEL=y
CONFIG_LOCKDEP=y
CONFIG_PROVE_LOCKING=y
CONFIG_SCHEDSTATS=y
-CONFIG_VMLINUX_VALIDATION=y
+CONFIG_NOINSTR_VALIDATION=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..77978e372377 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -21,6 +23,411 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
+
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on dyntick-idle entry. */
+static __always_inline void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static __always_inline void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
+static __always_inline void rcu_dynticks_task_trace_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
+static __always_inline void rcu_dynticks_task_trace_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ seq = ct_state_inc(offset);
+ // RCU is no longer watching. Better be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_dynticks_task_trace_exit(); // After ->dynticks update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_dynticks_nesting() == 0);
+ if (ct_dynticks_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->dynticks_nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_dynticks_task_enter();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_dynticks_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->dynticks_nesting++;
+ return;
+ }
+ rcu_dynticks_task_exit();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->dynticks_nesting, 1);
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ instrumentation_end();
+}
+
+/**
+ * ct_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_exit(void)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ instrumentation_begin();
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (ct_dynticks_nmi_nesting() != 1) {
+ trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
+ ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+ ct_dynticks_nmi_nesting() - 2);
+ instrumentation_end();
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(RCU_DYNTICKS_IDX);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_dynticks_task_enter();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+
+ if (!in_nmi())
+ rcu_dynticks_task_exit();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(RCU_DYNTICKS_IDX);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ ct_dynticks_nmi_nesting(),
+ ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+ ct_dynticks_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -28,9 +435,6 @@
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -51,29 +455,32 @@ static __always_inline void context_tracking_recursion_exit(void)
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
+
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
if (!context_tracking_recursion_enter())
return;
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
if (state == CONTEXT_USER) {
@@ -82,35 +489,77 @@ void noinstr __context_tracking_enter(enum ctx_state state)
vtime_user_enter(current);
instrumentation_end();
}
- rcu_user_enter();
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
-void context_tracking_enter(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
{
unsigned long flags;
/*
* Some contexts may involve an exception occuring in an irq,
* leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
@@ -119,21 +568,32 @@ void context_tracking_enter(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_enter(state);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -143,32 +603,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
if (!context_tracking_recursion_enter())
return;
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
+ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
if (state == CONTEXT_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_sub(state, &ct->state);
+ }
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_exit(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
unsigned long flags;
@@ -176,19 +668,30 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_exit(state);
+ __ct_user_exit(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
-void context_tracking_user_exit(void)
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
+ */
+void user_exit_callable(void)
{
user_exit();
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
{
static __initdata bool initialized = false;
@@ -212,12 +715,14 @@ void __init context_tracking_cpu_set(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 246efc74e3f3..ba4ba71facf9 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event)
* disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
* this.
*/
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index e978f36e6be8..8d0b68a17042 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -357,7 +357,7 @@ void dma_direct_free(struct device *dev, size_t size,
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ if (dma_set_encrypted(dev, cpu_addr, size))
return;
}
@@ -392,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
- unsigned int page_order = get_order(size);
void *vaddr = page_address(page);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -400,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ if (dma_set_encrypted(dev, vaddr, size))
return;
__dma_direct_free_pages(dev, page, size);
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 032f164abe7c..063068a9ea9b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -321,7 +321,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
}
/*
- * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
@@ -332,12 +332,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
- * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
@@ -350,7 +350,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter();
+ ct_irq_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -418,7 +418,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
instrumentation_end();
- rcu_irq_exit();
+ ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
@@ -436,7 +436,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
* was not watching on entry.
*/
if (state.exit_rcu)
- rcu_irq_exit();
+ ct_irq_exit();
}
}
@@ -449,7 +449,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
- rcu_nmi_enter();
+ ct_nmi_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
@@ -469,7 +469,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
}
instrumentation_end();
- rcu_nmi_exit();
+ ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..c9d32d4d2e20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
+ if (event->attr.read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6253,10 +6262,10 @@ again:
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ u64 values[6];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
@@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -11825,14 +11840,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11870,8 +11896,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11881,6 +11914,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11888,20 +11927,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb35b926024c..726132039c38 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
diff --git a/kernel/exit.c b/kernel/exit.c
index f072959fcab7..64c938ce36fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -766,7 +766,7 @@ void __noreturn do_exit(long code)
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
diff --git a/kernel/extable.c b/kernel/extable.c
index bda5e9761541..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr)
/* Treat this like an NMI as it can happen anywhere */
if (no_rcu)
- rcu_nmi_enter();
+ ct_nmi_enter();
if (is_module_text_address(addr))
goto out;
@@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr)
ret = 0;
out:
if (no_rcu)
- rcu_nmi_exit();
+ ct_nmi_exit();
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..28772142022a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->trc_reader_nesting = 0;
p->trc_reader_special.s = 0;
INIT_LIST_HEAD(&p->trc_holdout_list);
+ INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
@@ -2033,8 +2034,11 @@ static __latent_entropy struct task_struct *copy_process(
/*
* If the new process will be in a different time namespace
* do not allow it to share VM or a thread group with the forking task.
+ *
+ * On vfork, the child process enters the target time namespace only
+ * after exec.
*/
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if ((clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) {
if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
diff --git a/kernel/groups.c b/kernel/groups.c
index 787b381c7c00..9aaed2a31073 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
+ const struct cred *old;
+ int retval;
new = prepare_creds();
if (!new)
return -ENOMEM;
+ old = current_cred();
+
set_groups(new, group_info);
+
+ retval = security_task_fix_setgroups(new, old);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
EXPORT_SYMBOL(set_current_groups);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 80bfea5dd5c4..cff3ae8c818f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,8 +127,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* complain:
*/
if (sysctl_hung_task_warnings) {
- printk_prefer_direct_enter();
-
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -144,8 +142,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_all_cpu_backtrace)
hung_task_show_all_bt = true;
-
- printk_prefer_direct_exit();
}
touch_nmi_watchdog();
@@ -208,17 +204,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
}
unlock:
rcu_read_unlock();
- if (hung_task_show_lock) {
- printk_prefer_direct_enter();
+ if (hung_task_show_lock)
debug_show_all_locks();
- printk_prefer_direct_exit();
- }
if (hung_task_show_all_bt) {
hung_task_show_all_bt = false;
- printk_prefer_direct_enter();
trigger_all_cpu_backtrace();
- printk_prefer_direct_exit();
}
if (hung_task_call_panic)
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 10929eda9825..db3d174c53d4 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -24,6 +24,7 @@ config GENERIC_IRQ_SHOW_LEVEL
# Supports effective affinity mask
config GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ depends on SMP
bool
# Support for delayed migration from interrupt context
@@ -82,6 +83,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
+ depends on SMP
select IRQ_DOMAIN_HIERARCHY
# Generic MSI interrupt support
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e6b8e564b37f..8ac37e8e738a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -188,7 +188,8 @@ enum {
#ifdef CONFIG_SMP
static int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -224,7 +225,8 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
}
#else
static __always_inline int
-__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
{
return IRQ_STARTUP_NORMAL;
}
@@ -252,7 +254,7 @@ static int __irq_startup(struct irq_desc *desc)
int irq_startup(struct irq_desc *desc, bool resend, bool force)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
- struct cpumask *aff = irq_data_get_affinity_mask(d);
+ const struct cpumask *aff = irq_data_get_affinity_mask(d);
int ret = 0;
desc->depth = 0;
@@ -1006,8 +1008,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
- if (is_chained)
+ if (is_chained) {
desc->action = NULL;
+ WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ }
desc->depth = 1;
}
desc->handle_irq = handle;
@@ -1033,6 +1037,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
+ WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
irq_activate_and_startup(desc, IRQ_RESEND);
}
}
@@ -1513,7 +1518,8 @@ int irq_chip_request_resources_parent(struct irq_data *data)
if (data->chip->irq_request_resources)
return data->chip->irq_request_resources(data);
- return -ENOSYS;
+ /* no error on missing optional irq_chip::irq_request_resources */
+ return 0;
}
EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index bc8e40cf2b65..bbcaac64038e 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -30,7 +30,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
- struct cpumask *msk;
+ const struct cpumask *msk;
msk = irq_data_get_affinity_mask(data);
seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f0862eb6b506..c653cd31548d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -431,7 +431,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
return 0;
}
-static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
{
struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 08ce7da3b57c..bbd945bacef0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -115,11 +115,11 @@ free_descs:
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
struct irq_domain *domain;
unsigned int nr_irqs;
- if (!irq || !data || !ipimask)
+ if (!irq || !data)
return -EINVAL;
domain = data->domain;
@@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
return -EINVAL;
}
- if (WARN_ON(!cpumask_subset(dest, ipimask)))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
/*
* Must be destroying a subset of CPUs to which this IPI
* was set up to target
@@ -162,12 +163,13 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
struct irq_data *data = irq_get_irq_data(irq);
- struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ const struct cpumask *ipimask;
- if (!data || !ipimask || cpu >= nr_cpu_ids)
+ if (!data || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
- if (!cpumask_test_cpu(cpu, ipimask))
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
return INVALID_HWIRQ;
/*
@@ -186,7 +188,7 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
- struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+ const struct cpumask *ipimask = irq_data_get_affinity_mask(data);
if (!chip || !ipimask)
return -EINVAL;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d323b180b0f3..5db0230aa6b5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -251,7 +251,7 @@ static ssize_t actions_show(struct kobject *kobj,
char *p = "";
raw_spin_lock_irq(&desc->lock);
- for (action = desc->action; action != NULL; action = action->next) {
+ for_each_action_of_desc(desc, action) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
p, action->name);
p = ",";
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d5ce96510549..8fe1da9614ee 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -147,7 +147,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s
static atomic_t unknown_domains;
if (WARN_ON((size && direct_max) ||
- (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max)))
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) ||
+ (direct_max && (direct_max != hwirq_max))))
return NULL;
domain = kzalloc_node(struct_size(domain, revmap, size),
@@ -219,7 +220,6 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s
domain->hwirq_max = hwirq_max;
if (direct_max) {
- size = direct_max;
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
}
@@ -650,9 +650,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_size) {
- pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_size);
+ if (virq >= domain->hwirq_max) {
+ pr_err("ERROR: no free irqs available below %lu maximum\n",
+ domain->hwirq_max);
irq_free_desc(virq);
return 0;
}
@@ -906,10 +906,12 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
return desc;
if (irq_domain_is_nomap(domain)) {
- if (hwirq < domain->revmap_size) {
+ if (hwirq < domain->hwirq_max) {
data = irq_domain_get_irq_data(domain, hwirq);
if (data && data->hwirq == hwirq)
desc = irq_data_to_desc(data);
+ if (irq && desc)
+ *irq = hwirq;
}
return desc;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c396319d5ac..40fe7806cc8c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -205,16 +205,8 @@ static void irq_validate_effective_affinity(struct irq_data *data)
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
}
-
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask)
-{
- cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
-}
#else
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
-static inline void irq_init_effective_affinity(struct irq_data *data,
- const struct cpumask *mask) { }
#endif
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
@@ -347,7 +339,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
return false;
cpumask_copy(desc->irq_common_data.affinity, mask);
- irq_init_effective_affinity(data, mask);
+ irq_data_update_effective_affinity(data, mask);
irqd_set(data, IRQD_AFFINITY_SET);
return true;
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index ca71123a6130..c556bc49d213 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -147,7 +147,6 @@ void suspend_device_irqs(void)
synchronize_irq(irq);
}
}
-EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
@@ -259,4 +258,3 @@ void resume_device_irqs(void)
{
resume_irqs(false);
}
-EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b156e152d6b4..714ac4c3b556 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -332,17 +332,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ /* nothing to do on most architectures */
}
+#endif
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
@@ -508,7 +504,7 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
bool type = static_key_type(key);
@@ -596,31 +592,6 @@ static void __jump_label_mod_update(struct static_key *key)
}
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
-
- for (iter = iter_start; iter < iter_stop; iter++) {
- /* Only write NOPs for arch_branch_static(). */
- if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
- arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- }
-}
-
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig
new file mode 100644
index 000000000000..e82f0f52ab0a
--- /dev/null
+++ b/kernel/kcsan/.kunitconfig
@@ -0,0 +1,24 @@
+# Note that the KCSAN tests need to run on an SMP setup.
+# Under kunit_tool, this can be done by using the --qemu_args
+# option to configure a machine with several cores. For example:
+# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \
+# --arch=x86_64 --qemu_args="-smp 8"
+
+CONFIG_KUNIT=y
+
+CONFIG_DEBUG_KERNEL=y
+
+# Need some level of concurrency to test a concurrency sanitizer.
+CONFIG_SMP=y
+
+CONFIG_KCSAN=y
+CONFIG_KCSAN_KUNIT_TEST=y
+
+# Set these if you want to run test_barrier_nothreads
+#CONFIG_KCSAN_STRICT=y
+#CONFIG_KCSAN_WEAK_MEMORY=y
+
+# This prevents the test from timing out on many setups. Feel free to remove
+# (or alter) this, in conjunction with setting a different test timeout with,
+# for example, the --timeout kunit_tool option.
+CONFIG_KCSAN_REPORT_ONCE_IN_MS=100
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 4d34c78334ce..acd029b307e4 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -591,11 +591,6 @@ static void kimage_free_extra_pages(struct kimage *image)
}
-int __weak machine_kexec_post_load(struct kimage *image)
-{
- return 0;
-}
-
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -1020,15 +1015,6 @@ size_t crash_get_memory_size(void)
return size;
}
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
-}
-
int crash_shrink_memory(unsigned long new_size)
{
int ret = 0;
@@ -1225,16 +1211,3 @@ int kernel_kexec(void)
mutex_unlock(&kexec_mutex);
return error;
}
-
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_kexec_protect_crashkres(void)
-{}
-
-void __weak arch_kexec_unprotect_crashkres(void)
-{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 145321a5e798..a7b411c22f19 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -29,6 +29,15 @@
#include <linux/vmalloc.h>
#include "kexec_internal.h"
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
/*
@@ -53,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
return ret;
}
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_probe_default(image, buf, buf_len);
-}
-
-static void *kexec_image_load_default(struct kimage *image)
+void *kexec_image_load_default(struct kimage *image)
{
if (!image->fops || !image->fops->load)
return ERR_PTR(-ENOEXEC);
@@ -71,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image)
image->cmdline_buf_len);
}
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return kexec_image_load_default(image);
-}
-
int kexec_image_post_load_cleanup_default(struct kimage *image)
{
if (!image->fops || !image->fops->cleanup)
@@ -84,30 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
- return kexec_image_post_load_cleanup_default(image);
-}
-
-#ifdef CONFIG_KEXEC_SIG
-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- if (!image->fops || !image->fops->verify_sig) {
- pr_debug("kernel loader does not support signature verification.\n");
- return -EKEYREJECTED;
- }
-
- return image->fops->verify_sig(buf, buf_len);
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return kexec_image_verify_sig_default(image, buf, buf_len);
-}
-#endif
-
/*
* Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
@@ -150,16 +123,44 @@ void kimage_file_post_load_cleanup(struct kimage *image)
}
#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
+}
+#endif
+
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
static int
kimage_validate_signature(struct kimage *image)
{
int ret;
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
+ ret = kexec_image_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
if (ret) {
- if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+ if (sig_enforce) {
pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
@@ -613,19 +614,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
- * arch_kexec_locate_mem_hole - Find free memory to place the segments.
- * @kbuf: Parameters for the memory search.
- *
- * On success, kbuf->mem will have the start address of the memory region found.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
-{
- return kexec_locate_mem_hole(kbuf);
-}
-
-/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 544fd4097406..3c677918d8f2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -340,7 +340,7 @@ static int kthread(void *_create)
self = to_kthread(current);
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
@@ -398,7 +398,7 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
if (!done) {
@@ -440,9 +440,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * If I was killed by a fatal signal before kthreadd (or new
+ * kernel thread) calls complete(), leave the cleanup of this
+ * structure to that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
@@ -876,7 +876,7 @@ fail_task:
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
@@ -925,7 +925,7 @@ EXPORT_SYMBOL(kthread_create_worker);
* Return:
* The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81e87280513e..64a13eb56078 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5238,9 +5238,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
@@ -5432,7 +5433,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
* be guessable and still allows some pin nesting in
* our u32 pin_count.
*/
- cookie.val = 1 + (prandom_u32() >> 16);
+ cookie.val = 1 + (sched_clock() & 0xffff);
hlock->pin_count += cookie.val;
return cookie;
}
@@ -6570,7 +6571,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9d1db4a54d34..65f0262f635e 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -335,8 +335,6 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
-
- /* Writer only, not initialized in reader */
bool handoff_set;
};
#define rwsem_first_waiter(sem) \
@@ -459,10 +457,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -599,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -609,11 +609,20 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
if (has_handoff) {
- if (!first)
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
return false;
- /* First waiter inherits a previously set handoff bit */
- waiter->handoff_set = true;
+ /*
+ * First waiter can inherit a previously set handoff
+ * bit and spin on rwsem if lock acquisition fails.
+ */
+ if (waiter == first)
+ waiter->handoff_set = true;
}
new = count;
@@ -1027,6 +1036,7 @@ queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index bc5507ab8450..ec104c2950c3 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -11,6 +11,7 @@
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
+#include <linux/mm.h>
#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
@@ -30,11 +31,13 @@
* to ensure complete separation of code and data, but
* only when CONFIG_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_STRICT_MODULE_RWX
-# define strict_align(X) PAGE_ALIGN(X)
-#else
-# define strict_align(X) (X)
-#endif
+static inline unsigned int strict_align(unsigned int size)
+{
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return PAGE_ALIGN(size);
+ else
+ return size;
+}
extern struct mutex module_mutex;
extern struct list_head modules;
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 3e11523bc6f6..77e75bead569 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -137,6 +137,7 @@ void layout_symtab(struct module *mod, struct load_info *info)
info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1);
info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
mod->data_layout.size += strtab_size;
+ /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
info->core_typeoffs = mod->data_layout.size;
mod->data_layout.size += ndst * sizeof(char);
mod->data_layout.size = strict_align(mod->data_layout.size);
@@ -169,19 +170,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
Elf_Sym *dst;
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ unsigned long strtab_size;
/* Set up to point into init section. */
mod->kallsyms = (void __rcu *)mod->init_layout.base +
info->mod_kallsyms_init_off;
- preempt_disable();
+ rcu_read_lock();
/* The following is safe since this pointer cannot change */
- rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
- rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
+ rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- rcu_dereference_sched(mod->kallsyms)->strtab =
+ rcu_dereference(mod->kallsyms)->strtab =
(void *)info->sechdrs[info->index.str].sh_addr;
- rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
+ rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
/*
* Now populate the cut down core kallsyms for after init
@@ -190,22 +192,29 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs;
mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
- src = rcu_dereference_sched(mod->kallsyms)->symtab;
- for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) {
- rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+ strtab_size = info->core_typeoffs - info->stroffs;
+ src = rcu_dereference(mod->kallsyms)->symtab;
+ for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
+ rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
if (i == 0 || is_livepatch_module(mod) ||
is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
+ ssize_t ret;
+
mod->core_kallsyms.typetab[ndst] =
- rcu_dereference_sched(mod->kallsyms)->typetab[i];
+ rcu_dereference(mod->kallsyms)->typetab[i];
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- s += strscpy(s,
- &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
+ ret = strscpy(s,
+ &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
+ strtab_size);
+ if (ret < 0)
+ break;
+ s += ret + 1;
+ strtab_size -= ret + 1;
}
}
- preempt_enable();
+ rcu_read_unlock();
mod->core_kallsyms.num_symtab = ndst;
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index fed58d30725d..57fc2821be63 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1988,6 +1988,13 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
/* Set up license info based on the info section */
set_license(mod, get_modinfo(info, "license"));
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
+ }
+
return 0;
}
@@ -2087,6 +2094,12 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->static_call_sites),
&mod->num_static_call_sites);
#endif
+#ifdef CONFIG_KUNIT
+ mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+ sizeof(*mod->kunit_suites),
+ &mod->num_kunit_suites);
+#endif
+
mod->extable = section_objs(info, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
@@ -2939,24 +2952,25 @@ static void cfi_init(struct module *mod)
{
#ifdef CONFIG_CFI_CLANG
initcall_t *init;
+#ifdef CONFIG_MODULE_UNLOAD
exitcall_t *exit;
+#endif
rcu_read_lock_sched();
mod->cfi_check = (cfi_check_fn)
find_kallsyms_symbol_value(mod, "__cfi_check");
init = (initcall_t *)
find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
- exit = (exitcall_t *)
- find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
- rcu_read_unlock_sched();
-
/* Fix init/exit functions to point to the CFI jump table */
if (init)
mod->init = *init;
#ifdef CONFIG_MODULE_UNLOAD
+ exit = (exitcall_t *)
+ find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
if (exit)
mod->exit = *exit;
#endif
+ rcu_read_unlock_sched();
cfi_module_add(mod, mod_tree.addr_min);
#endif
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index eec72ca962e2..b4cbb406bc28 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -179,7 +179,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- timens_on_fork(new_ns, tsk);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index a3c758dba15a..c6eb8f8db0c0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -428,6 +428,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
[ TAINT_LIVEPATCH ] = { 'K', ' ', true },
[ TAINT_AUX ] = { 'X', ' ', true },
[ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
+ [ TAINT_TEST ] = { 'N', ' ', true },
};
/**
@@ -603,8 +604,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- printk_prefer_direct_enter();
-
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
@@ -634,8 +633,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
-
- printk_prefer_direct_exit();
}
#ifndef __WARN_FLAGS
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 6c373f2960e7..f82111837b8d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -145,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/*
* The power returned by active_state() is expected to be
- * positive and to fit into 16 bits.
+ * positive and be in range.
*/
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
@@ -170,7 +170,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
goto free_ps_table;
}
} else {
- power_res = em_scale_power(table[i].power);
+ power_res = table[i].power;
cost = div64_u64(fmax * power_res, table[i].frequency);
}
@@ -201,9 +201,17 @@ static int em_create_pd(struct device *dev, int nr_states,
{
struct em_perf_domain *pd;
struct device *cpu_dev;
- int cpu, ret;
+ int cpu, ret, num_cpus;
if (_is_cpu_device(dev)) {
+ num_cpus = cpumask_weight(cpus);
+
+ /* Prevent max possible energy calculation to not overflow */
+ if (num_cpus > EM_MAX_NUM_CPUS) {
+ dev_err(dev, "EM: too many CPUs, overflow possible\n");
+ return -EINVAL;
+ }
+
pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
if (!pd)
return -ENOMEM;
@@ -314,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* @cpus : Pointer to cpumask_t, which in case of a CPU device is
* obligatory. It can be taken from i.e. 'policy->cpus'. For other
* type of devices this should be set to NULL.
- * @milliwatts : Flag indicating that the power values are in milliWatts or
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
* in some other scale. It must be set properly.
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
* sub-systems might rely on this flag and check if all devices in the EM are
* using the same scale.
*
@@ -331,7 +339,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
*/
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
- bool milliwatts)
+ bool microwatts)
{
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
@@ -381,8 +389,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
- if (milliwatts)
- flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ if (microwatts)
+ flags |= EM_PERF_DOMAIN_MICROWATTS;
else if (cb->get_cost)
flags |= EM_PERF_DOMAIN_ARTIFICIAL;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 20a66bf9f465..89c71fce225d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -665,7 +665,7 @@ static void power_down(void)
hibernation_platform_enter();
fallthrough;
case HIBERNATION_SHUTDOWN:
- if (pm_power_off)
+ if (kernel_can_power_off())
kernel_power_off();
break;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index ec7e1e85923e..af51ed6d45ef 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -531,7 +531,7 @@ int freq_qos_add_request(struct freq_constraints *qos,
{
int ret;
- if (IS_ERR_OR_NULL(qos) || !req)
+ if (IS_ERR_OR_NULL(qos) || !req || value < 0)
return -EINVAL;
if (WARN(freq_qos_request_active(req),
@@ -563,7 +563,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request);
*/
int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || new_value < 0)
return -EINVAL;
if (WARN(!freq_qos_request_active(req),
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 91fffdd2c7fb..277434b6c0bf 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,15 +269,14 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
- struct hib_bio_batch *hb)
+static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
struct bio *bio;
int error = 0;
- bio = bio_alloc(hib_resume_bdev, 1, op | op_flags,
- GFP_NOIO | __GFP_HIGH);
+ bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
@@ -317,8 +316,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -331,7 +329,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block, swsusp_header, NULL);
} else {
pr_err("Swap header not found!\n");
@@ -408,7 +406,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -1003,7 +1001,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1027,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1526,8 +1524,7 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, 0,
- swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1535,7 +1532,7 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
@@ -1586,11 +1583,11 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
+ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
swsusp_header, NULL);
} else {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ad241b4ff64c..d43c2aa583b2 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -26,6 +26,7 @@
#include "power.h"
+static bool need_wait;
static struct snapshot_data {
struct snapshot_handle handle;
@@ -78,7 +79,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
* Resuming. We may need to wait for the image device to
* appear.
*/
- wait_for_device_probe();
+ need_wait = true;
data->swap = -1;
data->mode = O_WRONLY;
@@ -168,6 +169,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
ssize_t res;
loff_t pg_offp = *offp & ~PAGE_MASK;
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
lock_system_sleep();
data = filp->private_data;
@@ -244,6 +250,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
loff_t size;
sector_t offset;
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea3dd55709e7..a1a81fd9889b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -224,33 +224,6 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
static int nr_ext_console_drivers;
/*
- * Used to synchronize printing kthreads against direct printing via
- * console_trylock/console_unlock.
- *
- * Values:
- * -1 = console kthreads atomically blocked (via global trylock)
- * 0 = no kthread printing, console not locked (via trylock)
- * >0 = kthread(s) actively printing
- *
- * Note: For synchronizing against direct printing via
- * console_lock/console_unlock, see the @lock variable in
- * struct console.
- */
-static atomic_t console_kthreads_active = ATOMIC_INIT(0);
-
-#define console_kthreads_atomic_tryblock() \
- (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0)
-#define console_kthreads_atomic_unblock() \
- atomic_cmpxchg(&console_kthreads_active, -1, 0)
-#define console_kthreads_atomically_blocked() \
- (atomic_read(&console_kthreads_active) == -1)
-
-#define console_kthread_printing_tryenter() \
- atomic_inc_unless_negative(&console_kthreads_active)
-#define console_kthread_printing_exit() \
- atomic_dec(&console_kthreads_active)
-
-/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
@@ -298,49 +271,14 @@ static bool panic_in_progress(void)
}
/*
- * Tracks whether kthread printers are all blocked. A value of true implies
- * that the console is locked via console_lock() or the console is suspended.
- * Writing to this variable requires holding @console_sem.
- */
-static bool console_kthreads_blocked;
-
-/*
- * Block all kthread printers from a schedulable context.
- *
- * Requires holding @console_sem.
- */
-static void console_kthreads_block(void)
-{
- struct console *con;
-
- for_each_console(con) {
- mutex_lock(&con->lock);
- con->blocked = true;
- mutex_unlock(&con->lock);
- }
-
- console_kthreads_blocked = true;
-}
-
-/*
- * Unblock all kthread printers from a schedulable context.
- *
- * Requires holding @console_sem.
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held).
*/
-static void console_kthreads_unblock(void)
-{
- struct console *con;
-
- for_each_console(con) {
- mutex_lock(&con->lock);
- con->blocked = false;
- mutex_unlock(&con->lock);
- }
-
- console_kthreads_blocked = false;
-}
-
-static int console_suspended;
+static int console_locked, console_suspended;
/*
* Array of consoles built from command line options (console=)
@@ -423,75 +361,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);
-/*
- * A flag to signify if printk_activate_kthreads() has already started the
- * kthread printers. If true, any later registered consoles must start their
- * own kthread directly. The flag is write protected by the console_lock.
- */
-static bool printk_kthreads_available;
-
#ifdef CONFIG_PRINTK
-static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
-
-/**
- * printk_prefer_direct_enter - cause printk() calls to attempt direct
- * printing to all enabled consoles
- *
- * Since it is not possible to call into the console printing code from any
- * context, there is no guarantee that direct printing will occur.
- *
- * This globally effects all printk() callers.
- *
- * Context: Any context.
- */
-void printk_prefer_direct_enter(void)
-{
- atomic_inc(&printk_prefer_direct);
-}
-
-/**
- * printk_prefer_direct_exit - restore printk() behavior
- *
- * Context: Any context.
- */
-void printk_prefer_direct_exit(void)
-{
- WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
-}
-
-/*
- * Calling printk() always wakes kthread printers so that they can
- * flush the new message to their respective consoles. Also, if direct
- * printing is allowed, printk() tries to flush the messages directly.
- *
- * Direct printing is allowed in situations when the kthreads
- * are not available or the system is in a problematic state.
- *
- * See the implementation about possible races.
- */
-static inline bool allow_direct_printing(void)
-{
- /*
- * Checking kthread availability is a possible race because the
- * kthread printers can become permanently disabled during runtime.
- * However, doing that requires holding the console_lock, so any
- * pending messages will be direct printed by console_unlock().
- */
- if (!printk_kthreads_available)
- return true;
-
- /*
- * Prefer direct printing when the system is in a problematic state.
- * The context that sets this state will always see the updated value.
- * The other contexts do not care. Anyway, direct printing is just a
- * best effort. The direct output is only possible when console_lock
- * is not already taken and no kthread printers are actively printing.
- */
- return (system_state > SYSTEM_RUNNING ||
- oops_in_progress ||
- atomic_read(&printk_prefer_direct));
-}
-
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -2382,10 +2252,10 @@ asmlinkage int vprintk_emit(int facility, int level,
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched && allow_direct_printing()) {
+ if (!in_sched) {
/*
* The caller may be holding system-critical or
- * timing-sensitive locks. Disable preemption during direct
+ * timing-sensitive locks. Disable preemption during
* printing of all remaining records to all consoles so that
* this context can return as soon as possible. Hopefully
* another printk() caller will take over the printing.
@@ -2428,8 +2298,6 @@ EXPORT_SYMBOL(_printk);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
-static void printk_start_kthread(struct console *con);
-
#else /* CONFIG_PRINTK */
#define CONSOLE_LOG_MAX 0
@@ -2463,8 +2331,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le
}
static bool suppress_message_printing(int level) { return false; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
-static void printk_start_kthread(struct console *con) { }
-static bool allow_direct_printing(void) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2683,14 +2549,6 @@ static int console_cpu_notify(unsigned int cpu)
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
- else {
- /*
- * If a new CPU comes online, the conditions for
- * printer_should_wake() may have changed for some
- * kthread printer with !CON_ANYTIME.
- */
- wake_up_klogd();
- }
}
return 0;
}
@@ -2710,7 +2568,7 @@ void console_lock(void)
down_console_sem();
if (console_suspended)
return;
- console_kthreads_block();
+ console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
@@ -2731,30 +2589,15 @@ int console_trylock(void)
up_console_sem();
return 0;
}
- if (!console_kthreads_atomic_tryblock()) {
- up_console_sem();
- return 0;
- }
+ console_locked = 1;
console_may_schedule = 0;
return 1;
}
EXPORT_SYMBOL(console_trylock);
-/*
- * This is used to help to make sure that certain paths within the VT code are
- * running with the console lock held. It is definitely not the perfect debug
- * tool (it is not known if the VT code is the task holding the console lock),
- * but it helps tracking those weird code paths in the console code such as
- * when the console is suspended: where the console is not locked but no
- * console printing may occur.
- *
- * Note: This returns true when the console is suspended but is not locked.
- * This is intentional because the VT code must consider that situation
- * the same as if the console was locked.
- */
int is_console_locked(void)
{
- return (console_kthreads_blocked || atomic_read(&console_kthreads_active));
+ return console_locked;
}
EXPORT_SYMBOL(is_console_locked);
@@ -2777,9 +2620,18 @@ static bool abandon_console_lock_in_panic(void)
return atomic_read(&panic_cpu) != raw_smp_processor_id();
}
-static inline bool __console_is_usable(short flags)
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires the console_lock.
+ */
+static inline bool console_is_usable(struct console *con)
{
- if (!(flags & CON_ENABLED))
+ if (!(con->flags & CON_ENABLED))
+ return false;
+
+ if (!con->write)
return false;
/*
@@ -2788,43 +2640,15 @@ static inline bool __console_is_usable(short flags)
* cope (CON_ANYTIME) don't call them until this CPU is officially up.
*/
if (!cpu_online(raw_smp_processor_id()) &&
- !(flags & CON_ANYTIME))
+ !(con->flags & CON_ANYTIME))
return false;
return true;
}
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires holding the console_lock.
- */
-static inline bool console_is_usable(struct console *con)
-{
- if (!con->write)
- return false;
-
- return __console_is_usable(con->flags);
-}
-
static void __console_unlock(void)
{
- /*
- * Depending on whether console_lock() or console_trylock() was used,
- * appropriately allow the kthread printers to continue.
- */
- if (console_kthreads_blocked)
- console_kthreads_unblock();
- else
- console_kthreads_atomic_unblock();
-
- /*
- * New records may have arrived while the console was locked.
- * Wake the kthread printers to print them.
- */
- wake_up_klogd();
-
+ console_locked = 0;
up_console_sem();
}
@@ -2842,19 +2666,17 @@ static void __console_unlock(void)
*
* @handover will be set to true if a printk waiter has taken over the
* console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false. A NULL pointer may be provided
- * to disable allowing the console_lock to be taken over by a printk waiter.
+ * console_lock. Otherwise it is set to false.
*
* Returns false if the given console has no next record to print, otherwise
* true.
*
- * Requires the console_lock if @handover is non-NULL.
- * Requires con->lock otherwise.
+ * Requires the console_lock.
*/
-static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+ char *dropped_text, bool *handover)
{
- static atomic_t panic_console_dropped = ATOMIC_INIT(0);
+ static int panic_console_dropped;
struct printk_info info;
struct printk_record r;
unsigned long flags;
@@ -2863,8 +2685,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
- if (handover)
- *handover = false;
+ *handover = false;
if (!prb_read_valid(prb, con->seq, &r))
return false;
@@ -2872,8 +2693,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
if (con->seq != r.info->seq) {
con->dropped += r.info->seq - con->seq;
con->seq = r.info->seq;
- if (panic_in_progress() &&
- atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
+ if (panic_in_progress() && panic_console_dropped++ > 10) {
suppress_panic_printk = 1;
pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
}
@@ -2895,62 +2715,32 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
}
- if (handover) {
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- *
- * Interrupts are disabled because the hand over to a waiter
- * must not be interrupted until the hand over is completed
- * (@console_waiter is cleared).
- */
- printk_safe_enter_irqsave(flags);
- console_lock_spinning_enable();
-
- /* don't trace irqsoff print latency */
- stop_critical_timings();
- }
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+ stop_critical_timings(); /* don't trace print latency */
call_console_driver(con, write_text, len, dropped_text);
+ start_critical_timings();
con->seq++;
- if (handover) {
- start_critical_timings();
- *handover = console_lock_spinning_disable_and_check();
- printk_safe_exit_irqrestore(flags);
- }
+ *handover = console_lock_spinning_disable_and_check();
+ printk_safe_exit_irqrestore(flags);
skip:
return true;
}
/*
- * Print a record for a given console, but allow another printk() caller to
- * take over the console_lock and continue printing.
- *
- * Requires the console_lock, but depending on @handover after the call, the
- * caller may no longer have the console_lock.
- *
- * See __console_emit_next_record() for argument and return details.
- */
-static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
-{
- /*
- * Handovers are only supported if threaded printers are atomically
- * blocked. The context taking over the console_lock may be atomic.
- */
- if (!console_kthreads_atomically_blocked()) {
- *handover = false;
- handover = NULL;
- }
-
- return __console_emit_next_record(con, text, ext_text, dropped_text, handover);
-}
-
-/*
* Print out all remaining records to all consoles.
*
* @do_cond_resched is set by the caller. It can be true only in schedulable
@@ -2968,8 +2758,8 @@ static bool console_emit_next_record_transferable(struct console *con, char *tex
* were flushed to all usable consoles. A returned false informs the caller
* that everything was not flushed (either there were no usable consoles or
* another context has taken over printing or it is a panic situation and this
- * is not the panic CPU or direct printing is not preferred). Regardless the
- * reason, the caller should assume it is not useful to immediately try again.
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
*
* Requires the console_lock.
*/
@@ -2986,10 +2776,6 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
*handover = false;
do {
- /* Let the kthread printers do the work if they can. */
- if (!allow_direct_printing())
- return false;
-
any_progress = false;
for_each_console(con) {
@@ -3001,11 +2787,13 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
if (con->flags & CON_EXTENDED) {
/* Extended consoles do not print "dropped messages". */
- progress = console_emit_next_record_transferable(con, &text[0],
- &ext_text[0], NULL, handover);
+ progress = console_emit_next_record(con, &text[0],
+ &ext_text[0], NULL,
+ handover);
} else {
- progress = console_emit_next_record_transferable(con, &text[0],
- NULL, &dropped_text[0], handover);
+ progress = console_emit_next_record(con, &text[0],
+ NULL, &dropped_text[0],
+ handover);
}
if (*handover)
return false;
@@ -3120,13 +2908,10 @@ void console_unblank(void)
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
return;
- if (!console_kthreads_atomic_tryblock()) {
- up_console_sem();
- return;
- }
} else
console_lock();
+ console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
@@ -3405,10 +3190,6 @@ void register_console(struct console *newcon)
nr_ext_console_drivers++;
newcon->dropped = 0;
- newcon->thread = NULL;
- newcon->blocked = true;
- mutex_init(&newcon->lock);
-
if (newcon->flags & CON_PRINTBUFFER) {
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
@@ -3418,10 +3199,6 @@ void register_console(struct console *newcon)
/* Begin with next message. */
newcon->seq = prb_next_seq(prb);
}
-
- if (printk_kthreads_available)
- printk_start_kthread(newcon);
-
console_unlock();
console_sysfs_notify();
@@ -3448,7 +3225,6 @@ EXPORT_SYMBOL(register_console);
int unregister_console(struct console *console)
{
- struct task_struct *thd;
struct console *con;
int res;
@@ -3489,20 +3265,7 @@ int unregister_console(struct console *console)
console_drivers->flags |= CON_CONSDEV;
console->flags &= ~CON_ENABLED;
-
- /*
- * console->thread can only be cleared under the console lock. But
- * stopping the thread must be done without the console lock. The
- * task that clears @thread is the task that stops the kthread.
- */
- thd = console->thread;
- console->thread = NULL;
-
console_unlock();
-
- if (thd)
- kthread_stop(thd);
-
console_sysfs_notify();
if (console->exit)
@@ -3598,20 +3361,6 @@ static int __init printk_late_init(void)
}
late_initcall(printk_late_init);
-static int __init printk_activate_kthreads(void)
-{
- struct console *con;
-
- console_lock();
- printk_kthreads_available = true;
- for_each_console(con)
- printk_start_kthread(con);
- console_unlock();
-
- return 0;
-}
-early_initcall(printk_activate_kthreads);
-
#if defined CONFIG_PRINTK
/* If @con is specified, only wait for that console. Otherwise wait for all. */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
@@ -3631,6 +3380,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
diff = 0;
console_lock();
+
for_each_console(c) {
if (con && con != c)
continue;
@@ -3640,11 +3390,19 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
if (printk_seq < seq)
diff += seq - printk_seq;
}
- console_unlock();
- if (diff != last_diff && reset_on_progress)
+ /*
+ * If consoles are suspended, it cannot be expected that they
+ * make forward progress, so timeout immediately. @diff is
+ * still used to return a valid flush status.
+ */
+ if (console_suspended)
+ remaining = 0;
+ else if (diff != last_diff && reset_on_progress)
remaining = timeout_ms;
+ console_unlock();
+
if (diff == 0 || remaining == 0)
break;
@@ -3686,206 +3444,11 @@ bool pr_flush(int timeout_ms, bool reset_on_progress)
}
EXPORT_SYMBOL(pr_flush);
-static void __printk_fallback_preferred_direct(void)
-{
- printk_prefer_direct_enter();
- pr_err("falling back to preferred direct printing\n");
- printk_kthreads_available = false;
-}
-
-/*
- * Enter preferred direct printing, but never exit. Mark console threads as
- * unavailable. The system is then forever in preferred direct printing and
- * any printing threads will exit.
- *
- * Must *not* be called under console_lock. Use
- * __printk_fallback_preferred_direct() if already holding console_lock.
- */
-static void printk_fallback_preferred_direct(void)
-{
- console_lock();
- __printk_fallback_preferred_direct();
- console_unlock();
-}
-
-/*
- * Print a record for a given console, not allowing another printk() caller
- * to take over. This is appropriate for contexts that do not have the
- * console_lock.
- *
- * See __console_emit_next_record() for argument and return details.
- */
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text)
-{
- return __console_emit_next_record(con, text, ext_text, dropped_text, NULL);
-}
-
-static bool printer_should_wake(struct console *con, u64 seq)
-{
- short flags;
-
- if (kthread_should_stop() || !printk_kthreads_available)
- return true;
-
- if (con->blocked ||
- console_kthreads_atomically_blocked()) {
- return false;
- }
-
- /*
- * This is an unsafe read from con->flags, but a false positive is
- * not a problem. Worst case it would allow the printer to wake up
- * although it is disabled. But the printer will notice that when
- * attempting to print and instead go back to sleep.
- */
- flags = data_race(READ_ONCE(con->flags));
-
- if (!__console_is_usable(flags))
- return false;
-
- return prb_read_valid(prb, seq, NULL);
-}
-
-static int printk_kthread_func(void *data)
-{
- struct console *con = data;
- char *dropped_text = NULL;
- char *ext_text = NULL;
- u64 seq = 0;
- char *text;
- int error;
-
- text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
- if (!text) {
- con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
- printk_fallback_preferred_direct();
- goto out;
- }
-
- if (con->flags & CON_EXTENDED) {
- ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
- if (!ext_text) {
- con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n");
- printk_fallback_preferred_direct();
- goto out;
- }
- } else {
- dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
- if (!dropped_text) {
- con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n");
- printk_fallback_preferred_direct();
- goto out;
- }
- }
-
- con_printk(KERN_INFO, con, "printing thread started\n");
-
- for (;;) {
- /*
- * Guarantee this task is visible on the waitqueue before
- * checking the wake condition.
- *
- * The full memory barrier within set_current_state() of
- * prepare_to_wait_event() pairs with the full memory barrier
- * within wq_has_sleeper().
- *
- * This pairs with __wake_up_klogd:A.
- */
- error = wait_event_interruptible(log_wait,
- printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */
-
- if (kthread_should_stop() || !printk_kthreads_available)
- break;
-
- if (error)
- continue;
-
- error = mutex_lock_interruptible(&con->lock);
- if (error)
- continue;
-
- if (con->blocked ||
- !console_kthread_printing_tryenter()) {
- /* Another context has locked the console_lock. */
- mutex_unlock(&con->lock);
- continue;
- }
-
- /*
- * Although this context has not locked the console_lock, it
- * is known that the console_lock is not locked and it is not
- * possible for any other context to lock the console_lock.
- * Therefore it is safe to read con->flags.
- */
-
- if (!__console_is_usable(con->flags)) {
- console_kthread_printing_exit();
- mutex_unlock(&con->lock);
- continue;
- }
-
- /*
- * Even though the printk kthread is always preemptible, it is
- * still not allowed to call cond_resched() from within
- * console drivers. The task may become non-preemptible in the
- * console driver call chain. For example, vt_console_print()
- * takes a spinlock and then can call into fbcon_redraw(),
- * which can conditionally invoke cond_resched().
- */
- console_may_schedule = 0;
- console_emit_next_record(con, text, ext_text, dropped_text);
-
- seq = con->seq;
-
- console_kthread_printing_exit();
-
- mutex_unlock(&con->lock);
- }
-
- con_printk(KERN_INFO, con, "printing thread stopped\n");
-out:
- kfree(dropped_text);
- kfree(ext_text);
- kfree(text);
-
- console_lock();
- /*
- * If this kthread is being stopped by another task, con->thread will
- * already be NULL. That is fine. The important thing is that it is
- * NULL after the kthread exits.
- */
- con->thread = NULL;
- console_unlock();
-
- return 0;
-}
-
-/* Must be called under console_lock. */
-static void printk_start_kthread(struct console *con)
-{
- /*
- * Do not start a kthread if there is no write() callback. The
- * kthreads assume the write() callback exists.
- */
- if (!con->write)
- return;
-
- con->thread = kthread_run(printk_kthread_func, con,
- "pr/%s%d", con->name, con->index);
- if (IS_ERR(con->thread)) {
- con->thread = NULL;
- con_printk(KERN_ERR, con, "unable to start printing thread\n");
- __printk_fallback_preferred_direct();
- return;
- }
-}
-
/*
* Delayed printk version, for scheduler-internal messages:
*/
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_DIRECT_OUTPUT 0x02
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
@@ -3893,14 +3456,10 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_DIRECT_OUTPUT) {
- printk_prefer_direct_enter();
-
+ if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
-
- printk_prefer_direct_exit();
}
if (pending & PRINTK_PENDING_WAKEUP)
@@ -3925,11 +3484,10 @@ static void __wake_up_klogd(int val)
* prepare_to_wait_event(), which is called after ___wait_event() adds
* the waiter but before it has checked the wait condition.
*
- * This pairs with devkmsg_read:A, syslog_print:A, and
- * printk_kthread_func:A.
+ * This pairs with devkmsg_read:A and syslog_print:A.
*/
if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
- (val & PRINTK_PENDING_DIRECT_OUTPUT)) {
+ (val & PRINTK_PENDING_OUTPUT)) {
this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
@@ -3947,17 +3505,7 @@ void defer_console_output(void)
* New messages may have been added directly to the ringbuffer
* using vprintk_store(), so wake any waiters as well.
*/
- int val = PRINTK_PENDING_WAKEUP;
-
- /*
- * Make sure that some context will print the messages when direct
- * printing is allowed. This happens in situations when the kthreads
- * may not be as reliable or perhaps unusable.
- */
- if (allow_direct_printing())
- val |= PRINTK_PENDING_DIRECT_OUTPUT;
-
- __wake_up_klogd(val);
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
}
void printk_trigger_flush(void)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 156a99283b11..1893d909e45c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -222,7 +222,7 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
if (lock_task_sighand(task, &flags)) {
task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
if (__fatal_signal_pending(task)) {
- task->jobctl &= ~TASK_TRACED;
+ task->jobctl &= ~JOBCTL_TRACED;
wake_up_state(task, __TASK_TRACED);
}
unlock_task_sighand(task, &flags);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 1c630e573548..d471d22a5e21 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -8,6 +8,8 @@ menu "RCU Subsystem"
config TREE_RCU
bool
default y if SMP
+ # Dynticks-idle tracking
+ select CONTEXT_TRACKING_IDLE
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -262,6 +264,35 @@ config RCU_NOCB_CPU
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
+
+ Say Y here if you want offload all CPUs by default on boot.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Offload RCU callback from real-time kthread"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
depends on RCU_EXPERT && TASKS_TRACE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 9b64e55d4f61..4da05beb13d7 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -121,7 +121,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
default n
select PREEMPT_COUNT if PREEMPT=n
help
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4916077119f3..be5979da07f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,10 +12,6 @@
#include <trace/events/rcu.h>
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
-
-
/*
* Grace-period counter management.
*/
@@ -23,6 +19,9 @@
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
extern int sysctl_sched_rt_runtime;
/*
@@ -120,6 +119,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
}
/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+}
+
+/*
* Has a grace period completed since the time the old gp_seq was collected?
*/
static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 277a5bfb37d4..3ef02d4a8108 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -419,6 +419,7 @@ rcu_scale_writer(void *arg)
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
sched_set_fifo_low(current);
if (holdoff)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7120165a9342..d8e1b270a065 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -75,62 +75,47 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
-torture_param(int, read_exit_delay, 13,
- "Delay between read-then-exit episodes (s)");
-torture_param(int, read_exit_burst, 16,
- "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
-torture_param(bool, stall_no_softlockup, false,
- "Avoid softlockup warning during cpu stall.");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
- "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
@@ -209,12 +194,16 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_POLL_GET 7
-#define RTWS_POLL_WAIT 8
-#define RTWS_SYNC 9
-#define RTWS_STUTTER 10
-#define RTWS_STOPPING 11
+#define RTWS_COND_GET_EXP 6
+#define RTWS_COND_SYNC 7
+#define RTWS_COND_SYNC_EXP 8
+#define RTWS_POLL_GET 9
+#define RTWS_POLL_GET_EXP 10
+#define RTWS_POLL_WAIT 11
+#define RTWS_POLL_WAIT_EXP 12
+#define RTWS_SYNC 13
+#define RTWS_STUTTER 14
+#define RTWS_STOPPING 15
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -222,9 +211,13 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_EXP",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_EXP",
"RTWS_POLL_GET",
+ "RTWS_POLL_GET_EXP",
"RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_EXP",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -337,7 +330,12 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
unsigned long (*get_gp_state)(void);
+ unsigned long (*get_gp_completed)(void);
unsigned long (*start_gp_poll)(void);
bool (*poll_gp_state)(unsigned long oldstate);
void (*cond_sync)(unsigned long oldstate);
@@ -504,9 +502,14 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
+ .get_gp_completed = get_completed_synchronize_rcu,
.start_gp_poll = start_poll_synchronize_rcu,
.poll_gp_state = poll_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@@ -1136,9 +1139,8 @@ rcu_torture_fqs(void *arg)
return 0;
}
-// Used by writers to randomly choose from the available grace-period
-// primitives. The only purpose of the initialization is to size the array.
-static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
static int nsynctypes;
/*
@@ -1146,18 +1148,27 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
+ bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
+ !gp_normal1 && !gp_poll1 && !gp_sync1)
+ gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
+ gp_normal1 = gp_poll1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
} else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1176,6 +1187,12 @@ static void rcu_torture_write_types(void)
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1254,6 +1271,10 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_completed) {
+ cookie = cur_ops->get_gp_completed();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
cur_ops->readunlock(idx);
}
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1263,7 +1284,12 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
cur_ops->exp_sync();
+ cur_ops->exp_sync();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@@ -1274,6 +1300,14 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
gp_snap = cur_ops->start_gp_poll();
@@ -1283,9 +1317,23 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
cur_ops->sync();
+ cur_ops->sync();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1321,8 +1369,9 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
- rcu_ftrace_dump(DUMP_ALL);
+ tracing_off();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1384,6 +1433,11 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync(gp_snap);
break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1391,6 +1445,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_SYNC:
cur_ops->sync();
break;
@@ -1868,7 +1929,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@@ -1990,7 +2051,13 @@ static void rcu_torture_mem_dump_obj(void)
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
@@ -2007,6 +2074,8 @@ static void rcu_torture_mem_dump_obj(void)
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -2014,6 +2083,8 @@ static void rcu_torture_mem_dump_obj(void)
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -2075,6 +2146,19 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
@@ -2873,7 +2957,6 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Parent kthread which creates and destroys read-exit child kthreads.
static int rcu_torture_read_exit(void *unused)
{
- int count = 0;
bool errexit = false;
int i;
struct task_struct *tsp;
@@ -2885,34 +2968,28 @@ static int rcu_torture_read_exit(void *unused)
// Each pass through this loop does one read-exit episode.
do {
- if (++count > read_exit_burst) {
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
- rcu_barrier(); // Wait for task_struct free, avoid OOM.
- for (i = 0; i < read_exit_delay; i++) {
- schedule_timeout_uninterruptible(HZ);
- if (READ_ONCE(read_exit_child_stop))
- break;
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
}
- if (!READ_ONCE(read_exit_child_stop))
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
- count = 0;
- }
- if (READ_ONCE(read_exit_child_stop))
- break;
- // Spawn child.
- tsp = kthread_run(rcu_torture_read_exit_child,
- &trs, "%s",
- "rcu_torture_read_exit_child");
- if (IS_ERR(tsp)) {
- TOROUT_ERRSTRING("out of memory");
- errexit = true;
- tsp = NULL;
- break;
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
}
- cond_resched();
- kthread_stop(tsp);
- n_read_exits ++;
- stutter_wait("rcu_torture_read_exit");
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
} while (!errexit && !READ_ONCE(read_exit_child_stop));
// Clean up and exit.
@@ -3122,6 +3199,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
+ kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -3329,21 +3407,6 @@ rcu_torture_init(void)
rcutor_hp = firsterr;
if (torture_init_error(firsterr))
goto unwind;
-
- // Testing RCU priority boosting requires rcutorture do
- // some serious abuse. Counter this by running ksoftirqd
- // at higher priority.
- if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
- for_each_online_cpu(cpu) {
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(ksoftirqd, cpu);
- WARN_ON_ONCE(!t);
- sp.sched_priority = 2;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- }
- }
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 909644abee67..435c884c02b5 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = {
};
// Definitions for global spinlock
-static DEFINE_SPINLOCK(test_lock);
+static DEFINE_RAW_SPINLOCK(test_lock);
static void ref_lock_section(const int nloops)
{
@@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
- spin_unlock(&test_lock);
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
+ raw_spin_lock(&test_lock);
un_delay(udl, ndl);
- spin_unlock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
@@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
un_delay(udl, ndl);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 50ba70f019de..1c304fec89c0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}
-#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
-#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
-#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
-#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
+// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
/*
* Return grace-period delay, zero if there are expedited grace
@@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
+ unsigned long gpstart;
+ unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
jbase = 0;
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
- jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
- if (!jbase) {
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
- jbase = 1;
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(ssp->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
}
return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
@@ -607,15 +655,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-
-/*
* Start an SRCU grace period.
*/
static void srcu_gp_start(struct srcu_struct *ssp)
@@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = !!srcu_get_delay(ssp);
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}
@@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
j = jiffies;
if (READ_ONCE(ssp->reschedule_jiffies) == j) {
WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+ if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
WRITE_ONCE(ssp->reschedule_count, 1);
@@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 3925e32159b5..83c7e6620d40 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -14,7 +14,7 @@
struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
-typedef void (*pregp_func_t)(void);
+typedef void (*pregp_func_t)(struct list_head *hop);
typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
typedef void (*postscan_func_t)(struct list_head *hop);
typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
@@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_work: Work queue for invoking callbacks.
* @rtp_irq_work: IRQ work queue for deferred wakeups.
* @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
* @cpu: CPU number corresponding to this entry.
* @rtpp: Pointer to the rcu_tasks structure.
*/
@@ -40,6 +41,7 @@ struct rcu_tasks_percpu {
struct work_struct rtp_work;
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
+ struct list_head rtp_blkd_tasks;
int cpu;
struct rcu_tasks *rtpp;
};
@@ -48,6 +50,7 @@ struct rcu_tasks_percpu {
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
* @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
@@ -79,6 +82,7 @@ struct rcu_tasks_percpu {
struct rcu_tasks {
struct rcuwait cbs_wait;
raw_spinlock_t cbs_gbl_lock;
+ struct mutex tasks_gp_mutex;
int gp_state;
int gp_sleep;
int init_fract;
@@ -119,6 +123,7 @@ static struct rcu_tasks rt_name = \
{ \
.cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
.cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
.gp_func = gp, \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
@@ -140,6 +145,7 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
module_param(rcu_task_ipi_delay, int, 0644);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
@@ -253,6 +259,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@@ -323,17 +331,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
irq_work_queue(&rtpcp->rtp_irq_work);
}
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
-{
- /* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
-
- /* Wait for the grace period. */
- wait_rcu_gp(rtp->call_func);
-}
-
// RCU callback function for rcu_barrier_tasks_generic().
static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
{
@@ -439,6 +436,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
@@ -497,10 +499,41 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
rcu_tasks_invoke_cbs(rtp, rtpcp);
}
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
{
int needgpcb;
+
+ mutex_lock(&rtp->tasks_gp_mutex);
+
+ // If there were none, wait a bit and start over.
+ if (unlikely(midboot)) {
+ needgpcb = 0x2;
+ } else {
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
+ }
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
+ }
+
+ // Invoke callbacks.
+ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+ mutex_unlock(&rtp->tasks_gp_mutex);
+}
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
@@ -514,29 +547,28 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
- set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ // Wait for one grace period and invoke any callbacks
+ // that are ready.
+ rcu_tasks_one_gp(rtp, false);
- /* If there were none, wait a bit and start over. */
- rcuwait_wait_event(&rtp->cbs_wait,
- (needgpcb = rcu_tasks_need_gpcb(rtp)),
- TASK_IDLE);
-
- if (needgpcb & 0x2) {
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rcu_seq_start(&rtp->tasks_gp_seq);
- rtp->gp_func(rtp);
- rcu_seq_end(&rtp->tasks_gp_seq);
- }
+ // Paranoid sleep to keep this from entering a tight loop.
+ schedule_timeout_idle(rtp->gp_sleep);
+ }
+}
- /* Invoke callbacks. */
- set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+ /* Complain if the scheduler has not started. */
+ RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_rcu_tasks called too soon");
- /* Paranoid sleep to keep this from entering a tight loop */
- schedule_timeout_idle(rtp->gp_sleep);
+ // If the grace-period kthread is running, use it.
+ if (READ_ONCE(rtp->kthread_ptr)) {
+ wait_rcu_gp(rtp->call_func);
+ return;
}
+ rcu_tasks_one_gp(rtp, true);
}
/* Spawn RCU-tasks grace-period kthread. */
@@ -630,7 +662,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
- rtp->pregp_func();
+ rtp->pregp_func(&holdouts);
/*
* There were callbacks, so we need to wait for an RCU-tasks
@@ -639,10 +671,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* and make a list of them in holdouts.
*/
set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
- rcu_read_lock();
- for_each_process_thread(g, t)
- rtp->pertask_func(t, &holdouts);
- rcu_read_unlock();
+ if (rtp->pertask_func) {
+ rcu_read_lock();
+ for_each_process_thread(g, t)
+ rtp->pertask_func(t, &holdouts);
+ rcu_read_unlock();
+ }
set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
rtp->postscan_func(&holdouts);
@@ -760,7 +794,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// disabling.
/* Pre-grace-period preparation. */
-static void rcu_tasks_pregp_step(void)
+static void rcu_tasks_pregp_step(struct list_head *hop)
{
/*
* Wait for all pre-existing t->on_rq and t->nvcsw transitions
@@ -1105,11 +1139,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
-// There are of course downsides. The grace-period code can send IPIs to
-// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
-// It is necessary to scan the full tasklist, much as for Tasks RCU. There
-// is a single callback queue guarded by a single lock, again, much as for
-// Tasks RCU. If needed, these downsides can be at least partially remedied.
+// There are of course downsides. For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace. If needed, these downsides can be at least
+// partially remedied.
//
// Perhaps most important, this variant of RCU does not affect the vanilla
// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
@@ -1122,38 +1155,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// invokes these functions in this order:
//
// rcu_tasks_trace_pregp_step():
-// Initialize the count of readers and block CPU-hotplug operations.
-// rcu_tasks_trace_pertask(), invoked on every non-idle task:
-// Initialize per-task state and attempt to identify an immediate
-// quiescent state for that task, or, failing that, attempt to
-// set that task's .need_qs flag so that task's next outermost
-// rcu_read_unlock_trace() will report the quiescent state (in which
-// case the count of readers is incremented). If both attempts fail,
-// the task is added to a "holdout" list. Note that IPIs are used
-// to invoke trc_read_check_handler() in the context of running tasks
-// in order to avoid ordering overhead on common-case shared-variable
-// accessses.
+// Disables CPU hotplug, adds all currently executing tasks to the
+// holdout list, then checks the state of all tasks that blocked
+// or were preempted within their current RCU Tasks Trace read-side
+// critical section, adding them to the holdout list if appropriate.
+// Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
// rcu_tasks_trace_postscan():
-// Initialize state and attempt to identify an immediate quiescent
-// state as above (but only for idle tasks), unblock CPU-hotplug
-// operations, and wait for an RCU grace period to avoid races with
-// tasks that are in the process of exiting.
+// Invokes synchronize_rcu() to wait for late-stage exiting tasks
+// to finish exiting.
// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
-// corresponding task is removed from the holdout list.
+// corresponding task is removed from the holdout list. Once this
+// list is empty, the grace period has completed.
// rcu_tasks_trace_postgp():
-// Wait for the count of readers do drop to zero, reporting any stalls.
-// Also execute full memory barriers to maintain ordering with code
-// executing after the grace period.
+// Provides the needed full memory barrier and does debug checks.
//
// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
//
-// Pre-grace-period update-side code is ordered before the grace
-// period via the ->cbs_lock and barriers in rcu_tasks_kthread().
-// Pre-grace-period read-side code is ordered before the grace period by
-// atomic_dec_and_test() of the count of readers (for IPIed readers) and by
-// scheduler context-switch ordering (for locked-down non-running readers).
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
// The lockdep state must be outside of #ifdef to be useful.
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -1165,9 +1190,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
-
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -1176,44 +1198,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
static unsigned long n_heavy_reader_attempts;
static unsigned long n_heavy_reader_updates;
static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
"RCU Tasks Trace");
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+ smp_mb(); // Enforce full grace-period ordering.
+ return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+ smp_store_release(&t->trc_reader_special.b.need_qs, v);
+ smp_mb(); // Enforce full grace-period ordering.
+}
+
/*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
*/
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- wake_up(&trc_wait);
+ union rcu_special ret;
+ union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+ union rcu_special trs_new = trs_old;
+
+ if (trs_old.b.need_qs != old)
+ return trs_old.b.need_qs;
+ trs_new.b.need_qs = new;
+ ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+ return ret.b.need_qs;
}
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-/* If we are the last reader, wake up the grace-period kthread. */
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
void rcu_read_unlock_trace_special(struct task_struct *t)
{
- int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ union rcu_special trs;
+
+ // Open-coded full-word version of rcu_ld_need_qs().
+ smp_mb(); // Enforce full grace-period ordering.
+ trs = smp_load_acquire(&t->trc_reader_special);
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
- t->trc_reader_special.b.need_mb)
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
smp_mb(); // Pairs with update-side barriers.
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (nq)
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+ u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+ TRC_NEED_QS_CHECKED);
+
+ WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+ }
+ if (trs.b.blocked) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->trc_blkd_node);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
WRITE_ONCE(t->trc_reader_nesting, 0);
- if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
- irq_work_queue(&rcu_tasks_trace_iw);
}
EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+
+ local_irq_save(flags);
+ rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+ t->trc_blkd_cpu = smp_processor_id();
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
/* Add a task to the holdout list, if it is not already on the list. */
static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
{
if (list_empty(&t->trc_holdout_list)) {
get_task_struct(t);
list_add(&t->trc_holdout_list, bhp);
+ n_trc_holdouts++;
}
}
@@ -1223,37 +1305,36 @@ static void trc_del_holdout(struct task_struct *t)
if (!list_empty(&t->trc_holdout_list)) {
list_del_init(&t->trc_holdout_list);
put_task_struct(t);
+ n_trc_holdouts--;
}
}
/* IPI handler to check task state. */
static void trc_read_check_handler(void *t_in)
{
+ int nesting;
struct task_struct *t = current;
struct task_struct *texp = t_in;
// If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t)) {
+ if (unlikely(texp != t))
goto reset_ipi; // Already on holdout list, so will check later.
- }
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!READ_ONCE(t->trc_reader_nesting))) {
- WRITE_ONCE(t->trc_reader_checked, true);
+ nesting = READ_ONCE(t->trc_reader_nesting);
+ if (likely(!nesting)) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
+ if (unlikely(nesting < 0))
goto reset_ipi;
- WRITE_ONCE(t->trc_reader_checked, true);
- // Get here if the task is in a read-side critical section. Set
- // its state so that it will awaken the grace-period kthread upon
- // exit from that critical section.
- atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+ // Get here if the task is in a read-side critical section.
+ // Set its state so that it will update state for the grace-period
+ // kthread upon exit from that critical section.
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
@@ -1264,48 +1345,50 @@ reset_ipi:
}
/* Callback function for scheduler to check locked-down task. */
-static int trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
{
+ struct list_head *bhp = bhp_in;
int cpu = task_cpu(t);
int nesting;
bool ofl = cpu_is_offline(cpu);
- if (task_curr(t)) {
- WARN_ON_ONCE(ofl && !is_idle_task(t));
-
+ if (task_curr(t) && !ofl) {
// If no chance of heavyweight readers, do it the hard way.
- if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
- if (!ofl && // Check for "running" idle tasks on offline CPUs.
- !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ // Check for "running" idle tasks on offline CPUs.
+ if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
- if (ofl)
- n_heavy_reader_ofl_updates++;
nesting = 0;
} else {
// The task is not running, so C-language access is safe.
nesting = t->trc_reader_nesting;
+ WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+ n_heavy_reader_ofl_updates++;
}
// If not exiting a read-side critical section, mark as checked
// so that the grace-period kthread will remove it from the
// holdout list.
- t->trc_reader_checked = nesting >= 0;
- if (nesting <= 0)
- return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later.
+ if (!nesting) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ return 0; // In QS, so done.
+ }
+ if (nesting < 0)
+ return -EINVAL; // Reader transitioning, try again later.
// The task is in a read-side critical section, so set up its
- // state so that it will awaken the grace-period kthread upon exit
- // from that critical section.
- atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+ // state so that it will update state upon exit from that critical
+ // section.
+ if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+ trc_add_holdout(t, bhp);
return 0;
}
@@ -1321,14 +1404,14 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
- t->trc_reader_checked = true;
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
// Attempt to nail down the task for inspection.
get_task_struct(t);
- if (!task_call_func(t, trc_inspect_reader, NULL)) {
+ if (!task_call_func(t, trc_inspect_reader, bhp)) {
put_task_struct(t);
return;
}
@@ -1366,56 +1449,93 @@ static void trc_wait_for_one_reader(struct task_struct *t,
}
}
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Also, the grace-period
+ // kthread is always in a quiescent state. In addition, just return
+ // if this task is already on the list.
+ if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+ return false;
+
+ rcu_st_need_qs(t, 0);
+ t->trc_ipi_to_cpu = -1;
+ return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_wait_for_one_reader(t, hop);
+}
+
/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(void)
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
{
+ LIST_HEAD(blkd_tasks);
int cpu;
-
- // Allow for fast-acting IPIs.
- atomic_set(&trc_n_readers_need_end, 1);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t;
// There shouldn't be any old IPIs, but...
for_each_possible_cpu(cpu)
WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
- // Disable CPU hotplug across the tasklist scan.
- // This also waits for all readers in CPU-hotplug code paths.
+ // Disable CPU hotplug across the CPU scan for the benefit of
+ // any IPIs that might be needed. This also waits for all readers
+ // in CPU-hotplug code paths.
cpus_read_lock();
-}
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t,
- struct list_head *hop)
-{
- // During early boot when there is only the one boot CPU, there
- // is no idle task for the other CPUs. Just return.
- if (unlikely(t == NULL))
- return;
+ // These rcu_tasks_trace_pertask_prep() calls are serialized to
+ // allow safe access to the hop list.
+ for_each_online_cpu(cpu) {
+ rcu_read_lock();
+ t = cpu_curr_snapshot(cpu);
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_add_holdout(t, hop);
+ rcu_read_unlock();
+ }
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_checked, false);
- t->trc_ipi_to_cpu = -1;
- trc_wait_for_one_reader(t, hop);
+ // Only after all running tasks have been accounted for is it
+ // safe to take care of the tasks that have blocked within their
+ // current RCU tasks trace read-side critical section.
+ for_each_possible_cpu(cpu) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+ while (!list_empty(&blkd_tasks)) {
+ rcu_read_lock();
+ t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+ list_del_init(&t->trc_blkd_node);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ rcu_tasks_trace_pertask(t, hop);
+ rcu_read_unlock();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Re-enable CPU hotplug now that the holdout list is populated.
+ cpus_read_unlock();
}
/*
- * Do intermediate processing between task and holdout scans and
- * pick up the idle tasks.
+ * Do intermediate processing between task and holdout scans.
*/
static void rcu_tasks_trace_postscan(struct list_head *hop)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- rcu_tasks_trace_pertask(idle_task(cpu), hop);
-
- // Re-enable CPU hotplug now that the tasklist scan has completed.
- cpus_read_unlock();
-
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
synchronize_rcu();
- // Any tasks that exit after this point will set ->trc_reader_checked.
+ // Any tasks that exit after this point will set
+ // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
}
/* Communicate task state back to the RCU tasks trace stall warning request. */
@@ -1429,11 +1549,11 @@ static int trc_check_slow_task(struct task_struct *t, void *arg)
{
struct trc_stall_chk_rdr *trc_rdrp = arg;
- if (task_curr(t))
+ if (task_curr(t) && cpu_online(task_cpu(t)))
return false; // It is running, so decline to inspect it.
trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
- trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+ trc_rdrp->needqs = rcu_ld_need_qs(t);
return true;
}
@@ -1450,18 +1570,21 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
}
cpu = task_cpu(t);
if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
- pr_alert("P%d: %c\n",
+ pr_alert("P%d: %c%c\n",
t->pid,
+ ".I"[t->trc_ipi_to_cpu >= 0],
".i"[is_idle_tsk]);
else
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
+ pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
t->pid,
".I"[trc_rdr.ipi_to_cpu >= 0],
".i"[is_idle_tsk],
".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ ".B"[!!data_race(t->trc_reader_special.b.blocked)],
trc_rdr.nesting,
- " N"[!!trc_rdr.needqs],
- cpu);
+ " !CN"[trc_rdr.needqs & 0x3],
+ " ?"[trc_rdr.needqs > 0x3],
+ cpu, cpu_online(cpu) ? "" : "(offline)");
sched_show_task(t);
}
@@ -1481,18 +1604,18 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
{
struct task_struct *g, *t;
- // Disable CPU hotplug across the holdout list scan.
+ // Disable CPU hotplug across the holdout list scan for IPIs.
cpus_read_lock();
list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
// If safe and needed, try to check the current task.
if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !READ_ONCE(t->trc_reader_checked))
+ !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
- READ_ONCE(t->trc_reader_checked))
+ rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
@@ -1516,10 +1639,6 @@ static void rcu_tasks_trace_empty_fn(void *unused)
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
int cpu;
- bool firstreport;
- struct task_struct *g, *t;
- LIST_HEAD(holdouts);
- long ret;
// Wait for any lingering IPI handlers to complete. Note that
// if a CPU has gone offline or transitioned to userspace in the
@@ -1530,37 +1649,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
- // Remove the safety count.
- smp_mb__before_atomic(); // Order vs. earlier atomics
- atomic_dec(&trc_n_readers_need_end);
- smp_mb__after_atomic(); // Order vs. later atomics
-
- // Wait for readers.
- set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
- for (;;) {
- ret = wait_event_idle_exclusive_timeout(
- trc_wait,
- atomic_read(&trc_n_readers_need_end) == 0,
- READ_ONCE(rcu_task_stall_timeout));
- if (ret)
- break; // Count reached zero.
- // Stall warning time, so make a list of the offenders.
- rcu_read_lock();
- for_each_process_thread(g, t)
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- trc_add_holdout(t, &holdouts);
- rcu_read_unlock();
- firstreport = true;
- list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- show_stalled_task_trace(t, &firstreport);
- trc_del_holdout(t); // Release task_struct reference.
- }
- if (firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
- show_stalled_ipi_trace();
- pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
- }
smp_mb(); // Caller's code must be ordered after wakeup.
// Pairs with pretty much every ordering primitive.
}
@@ -1568,11 +1656,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
/* Report any needed quiescent state for this exiting task. */
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
- WRITE_ONCE(t->trc_reader_checked, true);
+ union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
- WRITE_ONCE(t->trc_reader_nesting, 0);
- if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
+ if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
rcu_read_unlock_trace_special(t);
+ else
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
/**
@@ -1646,7 +1737,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
rcu_tasks_trace.init_fract = 1;
}
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
@@ -1659,7 +1749,8 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+ sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
data_race(n_heavy_reader_attempts));
@@ -1686,23 +1777,24 @@ struct rcu_tasks_test_desc {
struct rcu_head rh;
const char *name;
bool notrun;
+ unsigned long runstart;
};
static struct rcu_tasks_test_desc tests[] = {
{
.name = "call_rcu_tasks()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
+ .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
},
{
.name = "call_rcu_tasks_rude()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
},
{
.name = "call_rcu_tasks_trace()",
/* If not defined, the test is skipped. */
- .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
}
};
@@ -1713,46 +1805,85 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
pr_info("Callback from %s invoked.\n", rttd->name);
- rttd->notrun = true;
+ rttd->notrun = false;
}
static void rcu_tasks_initiate_self_tests(void)
{
+ unsigned long j = jiffies;
+
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
+ tests[0].runstart = j;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
+ tests[1].runstart = j;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
+ tests[2].runstart = j;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
#endif
}
+/*
+ * Return: 0 - test passed
+ * 1 - test failed, but have not timed out yet
+ * -1 - test failed and timed out
+ */
static int rcu_tasks_verify_self_tests(void)
{
int ret = 0;
int i;
+ unsigned long bst = rcu_task_stall_timeout;
+ if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+ bst = RCU_TASK_BOOT_STALL_TIMEOUT;
for (i = 0; i < ARRAY_SIZE(tests); i++) {
- if (!tests[i].notrun) { // still hanging.
- pr_err("%s has been failed.\n", tests[i].name);
- ret = -1;
+ while (tests[i].notrun) { // still hanging.
+ if (time_after(jiffies, tests[i].runstart + bst)) {
+ pr_err("%s has failed boot-time tests.\n", tests[i].name);
+ ret = -1;
+ break;
+ }
+ ret = 1;
+ break;
}
}
-
- if (ret)
- WARN_ON(1);
+ WARN_ON(ret < 0);
return ret;
}
-late_initcall(rcu_tasks_verify_self_tests);
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+ int ret = rcu_tasks_verify_self_tests();
+
+ if (ret <= 0)
+ return;
+
+ /* Test fails but not timed out yet, reschedule another check */
+ schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+ INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+ rcu_tasks_verify_work_fn(NULL);
+ return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
#else /* #ifdef CONFIG_PROVE_RCU */
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 340b3f8b090d..f0561ee16b9c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -58,7 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
- WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_rcu() is a quiescent
- * state, and so on a UP system, synchronize_rcu() need do nothing.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
+ *
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@@ -152,6 +154,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@@ -213,10 +216,24 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ if (head) {
+ void *ptr = (void *) head - (unsigned long) func;
+
+ kasan_record_aux_stack_noalloc(ptr);
+ }
+
+ __kvfree_call_rcu(head, func);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c25ba442044a..3dc968006ad0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -62,6 +62,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kasan.h>
+#include <linux/context_tracking.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -75,9 +76,6 @@
/* Data structures. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
- .dynticks_nesting = 1,
- .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_RCU_CORE,
#endif
@@ -154,7 +152,11 @@ static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
-/* rcuc/rcub/rcuop kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
+ * real-time priority(enabling/disabling) is controlled by
+ * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
+ */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -263,56 +265,6 @@ void rcu_softirq_qs(void)
}
/*
- * Increment the current CPU's rcu_data structure's ->dynticks field
- * with ordering. Return the new value.
- */
-static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
-{
- return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks));
-}
-
-/*
- * Record entry into an extended quiescent state. This is only to be
- * called when not already in an extended quiescent state, that is,
- * RCU is watching prior to the call to this function and is no longer
- * watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_enter(void)
-{
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior RCU read-side
- * critical sections, and we also must force ordering with the
- * next idle sojourn.
- */
- rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = rcu_dynticks_inc(1);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
-}
-
-/*
- * Record exit from an extended quiescent state. This is only to be
- * called from an extended quiescent state, that is, RCU is not watching
- * prior to the call to this function and is watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_exit(void)
-{
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior idle sojourns,
- * and we also must force ordering with the next RCU read-side
- * critical section.
- */
- seq = rcu_dynticks_inc(1);
- // RCU is now watching. Better not be in an extended quiescent state!
- rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
-}
-
-/*
* Reset the current CPU's ->dynticks counter to indicate that the
* newly onlined CPU is no longer in an extended quiescent state.
* This will either leave the counter unchanged, or increment it
@@ -324,31 +276,19 @@ static noinstr void rcu_dynticks_eqs_exit(void)
*/
static void rcu_dynticks_eqs_online(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- if (atomic_read(&rdp->dynticks) & 0x1)
+ if (ct_dynticks() & RCU_DYNTICKS_IDX)
return;
- rcu_dynticks_inc(1);
-}
-
-/*
- * Is the current CPU in an extended quiescent state?
- *
- * No ordering, as we are sampling CPU-local information.
- */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
-{
- return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
+ ct_state_inc(RCU_DYNTICKS_IDX);
}
/*
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-static int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(int cpu)
{
smp_mb(); // Fundamental RCU ordering guarantee.
- return atomic_read_acquire(&rdp->dynticks);
+ return ct_dynticks_cpu_acquire(cpu);
}
/*
@@ -357,15 +297,13 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & 0x1);
+ return !(snap & RCU_DYNTICKS_IDX);
}
/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
bool rcu_is_idle_cpu(int cpu)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
- return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+ return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
}
/*
@@ -375,7 +313,7 @@ bool rcu_is_idle_cpu(int cpu)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp);
+ return snap != rcu_dynticks_snap(rdp->cpu);
}
/*
@@ -384,19 +322,17 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
*/
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~0x1;
-
+ snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == atomic_read(&rdp->dynticks);
+ return snap == ct_dynticks_cpu(cpu);
}
/*
@@ -415,9 +351,9 @@ notrace void rcu_momentary_dyntick_idle(void)
int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- seq = rcu_dynticks_inc(2);
+ seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(seq & 0x1));
+ WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -442,13 +378,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
"RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
"RCU dynticks_nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+ nesting = ct_dynticks_nmi_nesting();
if (nesting > 1)
return false;
@@ -458,7 +394,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
+ return ct_dynticks_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -609,66 +545,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- */
-static noinstr void rcu_eqs_enter(bool user)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdp->dynticks_nesting == 0);
- if (rdp->dynticks_nesting != 1) {
- // RCU will still be watching, so just do accounting and leave.
- rdp->dynticks_nesting--;
- return;
- }
-
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rcu_preempt_deferred_qs(current);
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(false);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-#ifdef CONFIG_NO_HZ_FULL
-
-#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
* IRQ tail once IRQs get re-enabled on userspace/guest resume.
@@ -690,7 +567,7 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
* last resort is to fire a local irq_work that will trigger a reschedule once IRQs
* get re-enabled again.
*/
-noinstr static void rcu_irq_work_resched(void)
+noinstr void rcu_irq_work_resched(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -706,114 +583,7 @@ noinstr static void rcu_irq_work_resched(void)
}
instrumentation_end();
}
-
-#else
-static inline void rcu_irq_work_resched(void) { }
-#endif
-
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace. No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
- lockdep_assert_irqs_disabled();
-
- /*
- * Other than generic entry implementation, we may be past the last
- * rescheduling opportunity in the entry code. Trigger a self IPI
- * that will fire and reschedule once we resume in user/guest mode.
- */
- rcu_irq_work_resched();
- rcu_eqs_enter(true);
-}
-
-#endif /* CONFIG_NO_HZ_FULL */
-
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
- *
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
- *
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_exit(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- instrumentation_begin();
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
-
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
- atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
- rdp->dynticks_nmi_nesting - 2);
- instrumentation_end();
- return;
- }
-
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
- instrumentation_end();
-
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
-
- if (!in_nmi())
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture's idle loop violates this assumption, RCU will give you what
- * you deserve, good and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_irq_exit(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
@@ -823,9 +593,9 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
"RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
DYNTICK_IRQ_NONIDLE,
"Bad RCU dynticks_nmi_nesting counter\n");
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -833,95 +603,8 @@ void rcu_irq_exit_check_preempt(void)
}
#endif /* #ifdef CONFIG_PROVE_RCU */
-/*
- * Wrapper for rcu_irq_exit() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_exit_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_exit();
- local_irq_restore(flags);
-}
-
-/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
- * allow for the possibility of usermode upcalls messing up our count of
- * interrupt nesting level during the busy period that is just now starting.
- */
-static void noinstr rcu_eqs_exit(bool user)
-{
- struct rcu_data *rdp;
- long oldval;
-
- lockdep_assert_irqs_disabled();
- rdp = this_cpu_ptr(&rcu_data);
- oldval = rdp->dynticks_nesting;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval) {
- // RCU was already watching, so just do accounting and leave.
- rdp->dynticks_nesting++;
- return;
- }
- rcu_dynticks_task_exit();
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
- instrumentation_begin();
-
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- WRITE_ONCE(rdp->dynticks_nesting, 1);
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
- instrumentation_end();
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_eqs_exit(false);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
#ifdef CONFIG_NO_HZ_FULL
/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
- rcu_eqs_exit(true);
-}
-
-/**
* __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
*
* The scheduler tick is not normally enabled when CPUs enter the kernel
@@ -983,109 +666,6 @@ void __rcu_irq_enter_check_tick(void)
}
#endif /* CONFIG_NO_HZ_FULL */
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
- * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active. This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int. (You will probably
- * run out of stack space first.)
- *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_enter(void)
-{
- long incby = 2;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- /* Complain about underflow. */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
-
- /*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
- * to be in the outermost NMI handler that interrupted an RCU-idle
- * period (observation due to Andy Lutomirski).
- */
- if (rcu_dynticks_curr_cpu_in_eqs()) {
-
- if (!in_nmi())
- rcu_dynticks_task_exit();
-
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
-
- instrumentation_begin();
- // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
- instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- incby = 1;
- } else if (!in_nmi()) {
- instrumentation_begin();
- rcu_irq_enter_check_tick();
- } else {
- instrumentation_begin();
- }
-
- trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
- rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
- rdp->dynticks_nmi_nesting + incby);
- barrier();
-}
-
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to user mode!
- * This code assumes that the idle loop never does upcalls to user mode.
- * If your architecture's idle loop does do upcalls to user mode (or does
- * anything else that results in unbalanced calls to the irq_enter() and
- * irq_exit() functions), RCU will give you what you deserve, good and hard.
- * But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_irq_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_enter();
-}
-
-/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
-}
-
/*
* Check to see if any future non-offloaded RCU-related work will need
* to be done by the current CPU, even if none need be done immediately,
@@ -1223,7 +803,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1775,6 +1355,79 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked())
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
+
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked())
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irq_rcu_node(rnp);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irq_rcu_node(rnp);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irq_rcu_node(rnp);
+}
+
/*
* Initialize a new grace period. Return false if no grace period required.
*/
@@ -1810,6 +1463,7 @@ static noinline_for_stack bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -1971,19 +1625,23 @@ static void rcu_gp_fqs(bool first_time)
*/
static noinline_for_stack void rcu_gp_fqs_loop(void)
{
- bool first_gp_fqs;
+ bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
- first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
- if (!ret) {
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
/*
* jiffies_force_qs before RCU_GP_WAIT_FQS state
@@ -2001,7 +1659,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
+ /*
+ * Exit the loop if the root rcu_node structure indicates that the grace period
+ * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
@@ -2069,6 +1735,7 @@ static noinline void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -2530,7 +2197,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
@@ -2608,7 +2275,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
- is_idle_task(current), rcu_is_callbacks_kthread());
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
@@ -3211,7 +2878,6 @@ struct kfree_rcu_cpu_work {
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
* @count: Number of objects for which GP not started
* @bkvcache:
@@ -3236,7 +2902,6 @@ struct kfree_rcu_cpu {
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
- bool monitor_todo;
bool initialized;
int count;
@@ -3416,6 +3081,18 @@ static void kfree_rcu_work(struct work_struct *work)
}
}
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (krcp->bkvhead[i])
+ return true;
+
+ return !!krcp->head;
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3472,9 +3149,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// of the channels that is still busy we should rearm the
// work to repeat an attempt. Because previous batches are
// still in progress.
- if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
- krcp->monitor_todo = false;
- else
+ if (need_offload_krc(krcp))
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
@@ -3662,11 +3337,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
WRITE_ONCE(krcp->count, krcp->count + 1);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !krcp->monitor_todo) {
- krcp->monitor_todo = true;
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- }
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -3741,14 +3413,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
- krcp->monitor_todo) {
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- continue;
- }
- krcp->monitor_todo = true;
- schedule_delayed_work_on(cpu, &krcp->monitor_work,
- KFREE_DRAIN_JIFFIES);
+ if (need_offload_krc(krcp))
+ schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3837,8 +3503,18 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with
+ // other normal GPs only by being fully nested within them,
+ // which allows reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+ if (rcu_init_invoked())
+ cond_resched_tasks_rcu_qs();
return; // Context allows vacuous grace periods.
+ }
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
@@ -3860,7 +3536,7 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
- return rcu_seq_snap(&rcu_state.gp_seq);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
@@ -3889,7 +3565,13 @@ unsigned long start_poll_synchronize_rcu(void)
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); // irqs already disabled.
- needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap. This is OK, the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
@@ -3911,7 +3593,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * more than a billion grace periods (and way more on a 64-bit system!).
* Those needing to keep oldstate values for very long time periods
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
@@ -3924,7 +3606,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
smp_mb(); /* Ensure GP ends before subsequent accesses. */
return true;
}
@@ -3935,20 +3618,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
* Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
* of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
@@ -4221,13 +3904,14 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
- WARN_ON_ONCE(rdp->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ WARN_ON_ONCE(ct->dynticks_nesting != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
@@ -4251,6 +3935,7 @@ rcu_boot_init_percpu_data(int cpu)
int rcutree_prepare_cpu(unsigned int cpu)
{
unsigned long flags;
+ struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rcu_get_root();
@@ -4259,7 +3944,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
+ ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -4441,6 +4126,7 @@ void rcu_report_dead(unsigned int cpu)
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
@@ -4486,6 +4172,7 @@ void rcutree_migrate_callbacks(int cpu)
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4701,6 +4388,9 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
mutex_init(&rnp->boost_kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@@ -4926,6 +4616,10 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
+
+ // Kick-start any polled grace periods that started early.
+ if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
+ (void)start_poll_synchronize_rcu_expedited();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2ccf5845957d..d4a97e40ea9c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -133,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@@ -187,9 +191,6 @@ struct rcu_data {
/* 3) dynticks interface. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- long dynticks_nesting; /* Track process nesting level. */
- long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
@@ -235,6 +236,7 @@ struct rcu_data {
* if rdp_gp.
*/
struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -323,6 +325,9 @@ struct rcu_state {
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@@ -425,12 +430,11 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
-static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
@@ -470,10 +474,6 @@ do { \
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
-static void rcu_dynticks_task_trace_enter(void);
-static void rcu_dynticks_task_trace_exit(void);
/* Forward declarations for tree_stall.h */
static void record_gp_stall_check_time(void);
@@ -481,3 +481,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0f70f62039a9..be667583a554 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@@ -356,7 +358,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(rdp);
+ snap = rcu_dynticks_snap(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -621,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
- panic_on_rcu_stall();
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
@@ -636,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c", cpu,
+ pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
@@ -669,6 +671,7 @@ static void synchronize_rcu_expedited_wait(void)
}
}
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+ panic_on_rcu_stall();
}
}
@@ -913,8 +916,18 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
/* Is the state is such that the call is a grace period? */
- if (rcu_blocking_is_gp())
- return;
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ if (rcu_init_invoked())
+ cond_resched();
+ return; // Context allows vacuous grace periods.
+ }
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
@@ -950,3 +963,93 @@ void synchronize_rcu_expedited(void)
synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ rnp->exp_seq_poll_rq = s;
+ if (rcu_init_invoked())
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 46694e13398a..a8f574d8850d 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -546,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
}
}
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+ bool *wake_state)
{
struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Offloading. Set our flag and notify the offload worker.
+ * We will handle this rdp until it ever gets de-offloaded.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 1;
+ } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 0;
+ } else {
+ WARN_ON_ONCE(1);
+ ret = -1;
}
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return ret;
}
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
@@ -609,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
bool needwake_gp;
- struct rcu_data *rdp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
struct rcu_node *rnp;
unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
bool wasempty = false;
@@ -634,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* is added to the list, so the skipped-over rcu_data structures
* won't be ignored for long.
*/
- list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
+ lockdep_assert_held(&rdp->nocb_lock);
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -656,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
@@ -705,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (needwake_gp)
rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
@@ -723,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ nocb_gp_sleep(my_rdp, cpu);
} else {
rnp = my_rdp->mynode;
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
@@ -739,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
!READ_ONCE(my_rdp->nocb_gp_sleep));
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
+
if (!rcu_nocb_poll) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&my_rdp->nocb_timer);
}
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is well
+ * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
+ * race with another round of nocb toggling for this rdp.
+ * Nocb locking should prevent from that already but we stick
+ * to paranoia, especially in rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ bool wake_state = false;
+ int ret;
+
+ ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ if (ret == 1)
+ list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+ else if (ret == 0)
+ list_del(&rdp_toggling->nocb_entry_rdp);
+ if (wake_state)
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
+
my_rdp->nocb_gp_seq = -1;
WARN_ON(signal_pending(current));
}
@@ -966,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
swake_up_one(&rdp->nocb_cb_wq);
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
if (rdp_gp->nocb_gp_sleep) {
rdp_gp->nocb_gp_sleep = false;
wake_gp = true;
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
+ return wake_gp;
}
static long rcu_nocb_rdp_deoffload(void *arg)
@@ -983,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * rcu_nocb_rdp_deoffload() may be called directly if
+ * rcuog/o[p] spawn failed, because at this time the rdp->cpu
+ * is not online yet.
+ */
+ WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
pr_info("De-offloading %d\n", rdp->cpu);
@@ -1009,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg)
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
invoke_rcu_core();
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- /* Stop nocb_gp_wait() from iterating over this structure. */
- list_del_rcu(&rdp->nocb_entry_rdp);
+ wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ /*
+ * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
+ * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+ */
+ if (!rdp->nocb_cb_kthread) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ } else {
+ /*
+ * No kthread to clear the flags for us or remove the rdp from the nocb list
+ * to iterate. Do it here instead. Locking doesn't look stricly necessary
+ * but we stick to paranoia in this rare path.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
/*
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
@@ -1035,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return ret;
+ return 0;
}
int rcu_nocb_cpu_deoffload(int cpu)
@@ -1043,8 +1103,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
- mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
@@ -1055,8 +1115,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
ret = -EINVAL;
}
}
- cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
return ret;
}
@@ -1067,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
/*
@@ -1077,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- pr_info("Offloading %d\n", rdp->cpu);
+ if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ return -EINVAL;
- /*
- * Cause future nocb_gp_wait() invocations to iterate over
- * structure, resetting ->nocb_gp_sleep and waking up the related
- * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
- * before setting ->nocb_gp_sleep again, we are guaranteed to
- * iterate this newly added structure before "rcuog" goes to
- * sleep again.
- */
- list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+ pr_info("Offloading %d\n", rdp->cpu);
/*
* Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
@@ -1111,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg)
* WRITE flags READ callbacks
* rcu_nocb_unlock() rcu_nocb_unlock()
*/
- ret = rdp_offload_toggle(rdp, true, flags);
+ wake_gp = rdp_offload_toggle(rdp, true, flags);
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
@@ -1124,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg)
rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
rcu_nocb_unlock_irqrestore(rdp, flags);
- return ret;
+ return 0;
}
int rcu_nocb_cpu_offload(int cpu)
@@ -1132,8 +1188,8 @@ int rcu_nocb_cpu_offload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
- mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
if (!rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
@@ -1144,8 +1200,8 @@ int rcu_nocb_cpu_offload(int cpu)
ret = -EINVAL;
}
}
- cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
return ret;
}
@@ -1155,11 +1211,21 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
+ bool offload_all = false;
struct rcu_data *rdp;
+#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
+ if (!rcu_state.nocb_is_setup) {
+ need_rcu_nocb_mask = true;
+ offload_all = true;
+ }
+#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */
+
#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
need_rcu_nocb_mask = true;
+ offload_all = false; /* NO_HZ_FULL has its own mask. */
+ }
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (need_rcu_nocb_mask) {
@@ -1180,6 +1246,9 @@ void __init rcu_init_nohz(void)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+ if (offload_all)
+ cpumask_setall(rcu_nocb_mask);
+
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
@@ -1246,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
"rcuog/%d", rdp_gp->cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
- return;
+ goto end;
}
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
if (kthread_prio)
@@ -1258,12 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
+ goto end;
- if (kthread_prio)
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+end:
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c8ba0fe17267..438ecae6bd7e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
-static void
+static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
@@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@@ -899,8 +899,8 @@ void rcu_note_context_switch(bool preempt)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
- rcu_tasks_qs(current, preempt);
out:
+ rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
@@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
// period for a quiescent state from this CPU. Note that requests from
// tasks are handled when removing the task from the blocked-tasks list
// below.
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
+}
+
#ifdef CONFIG_RCU_BOOST
/*
@@ -1140,7 +1159,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1151,15 +1171,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1242,11 +1253,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
@@ -1290,37 +1296,3 @@ static void rcu_bind_gp_kthread(void)
return;
housekeeping_affine(current, HK_TYPE_RCU);
}
-
-/* Record the current task on dyntick-idle entry. */
-static __always_inline void rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static __always_inline void rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static __always_inline void rcu_dynticks_task_trace_enter(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static __always_inline void rcu_dynticks_task_trace_exit(void)
-{
-#ifdef CONFIG_TASKS_TRACE_RCU
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4995c078cff9..c3fbbcc09327 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -409,7 +409,19 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
{
- unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity);
+ int cpu;
+ struct task_struct *rcuc;
+ unsigned long j;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return false;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return false;
+
+ j = jiffies - READ_ONCE(rdp->rcuc_activity);
if (jp)
*jp = j;
@@ -434,6 +446,9 @@ static void print_cpu_stall_info(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
+ bool rcuc_starved;
+ unsigned long j;
+ char buf[32];
/*
* We could be printing a lot while holding a spinlock. Avoid
@@ -450,8 +465,11 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
+ rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
+ if (rcuc_starved)
+ sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -460,36 +478,14 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
+ rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
}
-static void rcuc_kthread_dump(struct rcu_data *rdp)
-{
- int cpu;
- unsigned long j;
- struct task_struct *rcuc;
-
- rcuc = rdp->rcu_cpu_kthread_task;
- if (!rcuc)
- return;
-
- cpu = task_cpu(rcuc);
- if (cpu_is_offline(cpu) || idle_cpu(cpu))
- return;
-
- if (!rcu_is_rcuc_kthread_starving(rdp, &j))
- return;
-
- pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
- sched_show_task(rcuc);
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
-}
-
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
@@ -647,7 +643,6 @@ static void print_cpu_stall(unsigned long gps)
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
- printk_prefer_direct_enter();
trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
@@ -662,9 +657,6 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
- if (!use_softirq)
- rcuc_kthread_dump(rdp);
-
rcu_dump_cpu_stacks();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -685,7 +677,6 @@ static void print_cpu_stall(unsigned long gps)
*/
set_tsk_need_resched(current);
set_preempt_need_resched();
- printk_prefer_direct_exit();
}
static void check_cpu_stall(struct rcu_data *rdp)
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fc7fef575606..738842c4886b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -85,7 +85,7 @@ module_param(rcu_normal_after_boot, int, 0444);
* and while lockdep is disabled.
*
* Note that if the CPU is in the idle loop from an RCU point of view (ie:
- * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
* then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
* rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
* in such a section, considering these as in extended quiescent state,
@@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+ return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
+
#ifdef CONFIG_PROVE_RCU
/*
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b5a71d1ff603..3c35445bf5ad 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -819,11 +819,9 @@ static int __orderly_reboot(void)
ret = run_cmd(reboot_cmd);
if (ret) {
- printk_prefer_direct_enter();
pr_warn("Failed to start orderly reboot: forcing the issue\n");
emergency_sync();
kernel_restart(NULL);
- printk_prefer_direct_exit();
}
return ret;
@@ -836,7 +834,6 @@ static int __orderly_poweroff(bool force)
ret = run_cmd(poweroff_cmd);
if (ret && force) {
- printk_prefer_direct_enter();
pr_warn("Failed to start orderly shutdown: forcing the issue\n");
/*
@@ -846,7 +843,6 @@ static int __orderly_poweroff(bool force)
*/
emergency_sync();
kernel_power_off();
- printk_prefer_direct_exit();
}
return ret;
@@ -904,8 +900,6 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
*/
static void hw_failure_emergency_poweroff_func(struct work_struct *work)
{
- printk_prefer_direct_enter();
-
/*
* We have reached here after the emergency shutdown waiting period has
* expired. This means orderly_poweroff has not been able to shut off
@@ -922,8 +916,6 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work)
*/
pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
emergency_restart();
-
- printk_prefer_direct_exit();
}
static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
@@ -962,13 +954,11 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
{
static atomic_t allow_proceed = ATOMIC_INIT(1);
- printk_prefer_direct_enter();
-
pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
/* Shutdown should be initiated only once. */
if (!atomic_dec_and_test(&allow_proceed))
- goto out;
+ return;
/*
* Queue a backup emergency shutdown in the event of
@@ -976,8 +966,6 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
*/
hw_failure_emergency_poweroff(ms_until_forced);
orderly_poweroff(true);
-out:
- printk_prefer_direct_exit();
}
EXPORT_SYMBOL_GPL(hw_protection_shutdown);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97ac20b4f738..bda8175f8f99 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,8 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
-#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
/*
*
@@ -175,23 +176,15 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
u32 flags, event_mask;
int ret;
+ if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags)
+ return -EINVAL;
+
/* Get thread flags. */
ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
- /* Take critical section flags into account. */
- flags |= cs_flags;
-
- /*
- * Restart on signal can only be inhibited when restart on
- * preempt and restart on migrate are inhibited too. Otherwise,
- * a preempted signal handler could fail to restart the prior
- * execution context on sigreturn.
- */
- if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
- (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
- RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+ if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags)
return -EINVAL;
/*
@@ -203,7 +196,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
t->rseq_event_mask = 0;
preempt_enable();
- return !!(event_mask & ~flags);
+ return !!event_mask;
}
static int clear_rseq_cs(struct task_struct *t)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfa7452ca92e..189999007f32 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,7 @@
#include "stats.h"
#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
+#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
/*
@@ -873,15 +873,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -890,7 +886,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -905,30 +901,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
- val = old;
}
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -3808,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3824,13 +3818,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3838,10 +3840,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -4163,7 +4162,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -4264,6 +4263,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
}
/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU. If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct task_struct *t;
+
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ return t;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -4753,7 +4784,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
@@ -4798,25 +4830,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
static void balance_push(struct rq *rq);
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
struct callback_head balance_push_callback = {
.next = NULL,
.func = (void (*)(struct callback_head *))balance_push,
};
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct callback_head *
+__splice_balance_callbacks(struct rq *rq, bool split)
{
struct callback_head *head = rq->balance_callback;
+ if (likely(!head))
+ return NULL;
+
lockdep_assert_rq_held(rq);
- if (head)
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
rq->balance_callback = NULL;
return head;
}
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
static void __balance_callbacks(struct rq *rq)
{
- do_balance_callbacks(rq, splice_balance_callbacks(rq));
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
@@ -6470,8 +6532,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
io_wq_worker_sleeping(tsk);
}
- if (tsk_is_pi_blocked(tsk))
- return;
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -6529,7 +6595,7 @@ void __sched schedule_idle(void)
} while (need_resched());
}
-#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -6968,17 +7034,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -7107,12 +7185,14 @@ struct task_struct *idle_task(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p)
{
- unsigned long dl_util, util, irq;
+ unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
+ max = arch_scale_cpu_capacity(cpu);
+
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
@@ -7192,10 +7272,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
- ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -7257,6 +7336,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7298,58 +7440,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -9501,7 +9596,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
void __init sched_init(void)
{
@@ -9550,7 +9645,7 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 38a2cec21014..93878cb2a46d 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -56,7 +56,6 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
unsigned long old_cookie;
struct rq_flags rf;
struct rq *rq;
- bool enqueued;
rq = task_rq_lock(p, &rf);
@@ -68,14 +67,16 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
*/
SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
- enqueued = sched_core_enqueued(p);
- if (enqueued)
+ if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
- if (enqueued)
+ /*
+ * Consider the cases: !prev_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
sched_core_enqueue(rq, p);
/*
@@ -277,7 +278,11 @@ void __sched_core_account_forceidle(struct rq *rq)
if (p == rq_i->idle)
continue;
- __schedstat_add(p->stats.core_forceidle_sum, delta);
+ /*
+ * Note: this will account forceidle to the current cpu, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
}
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3dbf351d12d5..1207c78f85c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -157,11 +157,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
- sg_cpu->max = max;
+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 78a233d43757..95fc77853743 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -226,6 +226,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5152961b743..0ab79d819a0d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -30,14 +30,16 @@ static struct ctl_table sched_dl_sysctls[] = {
.data = &sysctl_sched_dl_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
},
{
.procname = "sched_deadline_period_min_us",
.data = &sysctl_sched_dl_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
},
{}
};
@@ -1701,7 +1703,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* the throttle.
*/
p->dl.dl_throttled = 0;
- BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
+ if (!(flags & ENQUEUE_REPLENISH))
+ printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+ task_pid_nr(p));
+
return;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77b2048a9326..914096c5b1ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -612,11 +612,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
/* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime,
+ max_vruntime(cfs_rq->min_vruntime, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -1055,6 +1052,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1548,8 +1572,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1790,6 +1812,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -2885,6 +2916,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
@@ -3144,6 +3176,8 @@ void reweight_task(struct task_struct *p, int prio)
load->inv_weight = sched_prio_to_wmult[prio];
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3254,8 +3288,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -3313,6 +3345,34 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
#ifdef CONFIG_SMP
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
@@ -3345,27 +3405,12 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->load.weight)
return false;
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
+ if (!load_avg_is_decayed(&cfs_rq->avg))
return false;
if (child_cfs_rq_on_list(cfs_rq))
return false;
- /*
- * _avg must be null when _sum are null because _avg = _sum / divider
- * Make sure that rounding and/or propagation of PELT values never
- * break this.
- */
- SCHED_WARN_ON(cfs_rq->avg.load_avg ||
- cfs_rq->avg.util_avg ||
- cfs_rq->avg.runnable_avg);
-
return true;
}
@@ -3423,27 +3468,9 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
-
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
-
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3722,6 +3749,89 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ /*
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
+ *
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
+ *
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
+ *
+ * We can then write:
+ *
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
+ * is observed the old clock_pelt_idle value and the new clock_idle,
+ * which lead to an underestimation. The opposite would lead to an
+ * overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
+
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
@@ -3796,12 +3906,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
-
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
return decayed;
}
@@ -3933,27 +4040,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -4368,16 +4454,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4477,6 +4558,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
}
/*
@@ -4992,11 +5076,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- /* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
- if (cfs_rq->on_list)
- goto unthrottle_throttle;
- return;
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
}
task_delta = cfs_rq->h_nr_running;
@@ -5034,31 +5125,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(qcfs_rq))
- list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
unthrottle_throttle:
- /*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
- */
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(qcfs_rq))
- break;
- }
-
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -5713,13 +5785,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -5743,21 +5808,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5844,7 +5894,7 @@ dequeue_throttle:
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -6334,8 +6384,9 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
@@ -6375,6 +6426,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because !--nr is the condition to stop scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6420,7 +6482,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
int cpu, best_cpu = -1;
struct cpumask *cpus;
- cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
task_util = uclamp_task_util(p);
@@ -6470,7 +6532,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/*
- * per-cpu select_idle_mask usage
+ * per-cpu select_rq_mask usage
*/
lockdep_assert_irqs_disabled();
@@ -6640,62 +6702,96 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
+ */
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
+
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense with the most recent CPUs where the task has
+ * run.
+ */
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
+{
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
+
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
+
+ eenv->task_busy_time = busy_time;
+}
+
+/*
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization for each @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (eventually clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util_next()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
+ *
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
*/
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
- unsigned long _cpu_cap = cpu_cap;
+ unsigned long busy_time = 0;
int cpu;
- _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util_next(cpu, p, -1);
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util, util_running = util_freq;
- struct task_struct *tsk = NULL;
+ busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+ }
- /*
- * When @p is placed on @cpu:
- *
- * util_running = max(cpu_util, cpu_util_est) +
- * max(task_util, _task_util_est)
- *
- * while cpu_util_next is: max(cpu_util + task_util,
- * cpu_util_est + _task_util_est)
- */
- if (cpu == dst_cpu) {
- tsk = p;
- util_running =
- cpu_util_next(cpu, p, -1) + task_util_est(p);
- }
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+}
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
+/*
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
+ */
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = 0;
+ int cpu;
- sum_util += min(cpu_util, _cpu_cap);
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util_next(cpu, p, dst_cpu);
+ unsigned long cpu_util;
/*
* Performance domain frequency: utilization clamping
@@ -6704,12 +6800,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, min(cpu_util, _cpu_cap));
+ cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
}
/*
@@ -6753,12 +6866,13 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- int cpu, best_energy_cpu = prev_cpu, target = -1;
- unsigned long cpu_cap, util, base_energy = 0;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
struct sched_domain *sd;
struct perf_domain *pd;
+ struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
@@ -6781,20 +6895,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!task_util_est(p))
goto unlock;
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cur_delta, max_spare_cap = 0;
bool compute_prev_delta = false;
- unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ unsigned long base_energy;
+
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
+ continue;
+
+ /* Account thermal pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+ cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+ eenv.cpu_cap = cpu_thermal_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ eenv.pd_cap += cpu_thermal_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap;
- lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@@ -6807,15 +6940,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
+ lsub_positive(&cpu_cap, util);
+
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
compute_prev_delta = true;
- } else if (spare_cap > max_spare_cap) {
+ } else if (cpu_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
* in the performance domain.
*/
- max_spare_cap = spare_cap;
+ max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
}
}
@@ -6823,25 +6958,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (max_spare_cap_cpu < 0 && !compute_prev_delta)
continue;
+ eenv_pd_busy_time(&eenv, cpus, p);
/* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
if (compute_prev_delta) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- if (prev_delta < base_energy_pd)
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
goto unlock;
- prev_delta -= base_energy_pd;
+ prev_delta -= base_energy;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- if (cur_delta < base_energy_pd)
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
goto unlock;
- cur_delta -= base_energy_pd;
+ cur_delta -= base_energy;
if (cur_delta < best_delta) {
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
@@ -6850,12 +6989,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
rcu_read_unlock();
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if ((prev_delta == ULONG_MAX) ||
- (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ if (best_delta < prev_delta)
target = best_energy_cpu;
return target;
@@ -6951,6 +7085,8 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
+ struct sched_entity *se = &p->se;
+
/*
* As blocked tasks retain absolute vruntime the migration needs to
* deal with this by subtracting the old and adding the new
@@ -6958,23 +7094,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* the task on the new runqueue.
*/
if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
- se->vruntime -= min_vruntime;
+ se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
}
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
@@ -6983,25 +7105,29 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* rq->lock and can modify state directly.
*/
lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(&p->se);
+ detach_entity_cfs_rq(se);
} else {
+ remove_entity_load_avg(se);
+
/*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
+ se->avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->exec_start = 0;
update_scan_period(p, new_cpu);
}
@@ -7585,8 +7711,8 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
@@ -8167,6 +8293,9 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
+
if (cfs_rq == &rq->cfs)
decayed = true;
}
@@ -8500,7 +8629,7 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
+ * We consider that a group has spare capacity if the number of task is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
@@ -8669,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8691,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
- sgs->group_load += cpu_load(rq);
+ sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8722,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;
- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}
@@ -8779,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -9058,16 +9208,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
- return running <= imb_numa_nr;
-}
-
-/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
@@ -9183,7 +9323,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -9196,17 +9338,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
* would remain below threshold where an imbalance is
- * allowed. If there is a real need of migration,
- * periodic load balance will take care of it.
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
+ * take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -9222,6 +9378,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
+{
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
+
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
+
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
+
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -9234,6 +9461,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
+ unsigned long sum_util = 0;
int sg_status = 0;
do {
@@ -9266,6 +9494,7 @@ next_group:
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -9291,24 +9520,8 @@ next_group:
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
-}
-
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr)
-{
- if (!allow_numa_imbalance(dst_running, imb_numa_nr))
- return imbalance;
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
-
- return imbalance;
+ update_idle_cpu_scan(env, sum_util);
}
/**
@@ -9325,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}
@@ -9395,7 +9617,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9403,15 +9625,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- local->sum_nr_running + 1, env->sd->imb_numa_nr);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
@@ -9834,9 +10062,15 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
@@ -11287,9 +11521,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- list_add_leaf_cfs_rq(cfs_rq_of(se));
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -11297,14 +11535,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (!cfs_rq_throttled(cfs_rq)){
- update_load_avg(cfs_rq, se, UPDATE_TG);
- list_add_leaf_cfs_rq(cfs_rq);
- continue;
- }
+ update_load_avg(cfs_rq, se, UPDATE_TG);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (cfs_rq_throttled(cfs_rq))
break;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
@@ -11422,10 +11659,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..ee7f23c76bd3 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -60,7 +60,8 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_PROP, false)
+SCHED_FEAT(SIS_UTIL, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 328cccbee444..f26ab2675f7d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -53,14 +53,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
{
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
- rcu_idle_enter();
+ ct_idle_enter();
local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
- rcu_idle_exit();
+ ct_idle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
@@ -98,12 +98,12 @@ void __cpuidle default_idle_call(void)
*
* Trace IRQs enable here, then switch off RCU, and have
* arch_cpu_idle() use raw_local_irq_enable(). Note that
- * rcu_idle_enter() relies on lockdep IRQ state, so switch that
+ * ct_idle_enter() relies on lockdep IRQ state, so switch that
* last -- this is very similar to the entry code.
*/
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
- rcu_idle_enter();
+ ct_idle_enter();
lockdep_hardirqs_on(_THIS_IP_);
arch_cpu_idle();
@@ -116,7 +116,7 @@ void __cpuidle default_idle_call(void)
*/
raw_local_irq_disable();
lockdep_hardirqs_off(_THIS_IP_);
- rcu_idle_exit();
+ ct_idle_exit();
lockdep_hardirqs_on(_THIS_IP_);
raw_local_irq_enable();
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 4ff2ed4f8fa1..3a0e0dc28721 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+/* The rq is idle, we can sync to clock_task */
+static inline void _update_idle_rq_clock_pelt(struct rq *rq)
+{
+ rq->clock_pelt = rq_clock_task(rq);
+
+ u64_u32_store(rq->clock_idle, rq_clock(rq));
+ /* Paired with smp_rmb in migrate_se_pelt_lag() */
+ smp_wmb();
+ u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
+}
+
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
@@ -76,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
- /* The rq is idle, we can sync to clock_task */
- rq->clock_pelt = rq_clock_task(rq);
+ _update_idle_rq_clock_pelt(rq);
return;
}
@@ -130,17 +148,23 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ _update_idle_rq_clock_pelt(rq);
}
-static inline u64 rq_clock_pelt(struct rq *rq)
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- lockdep_assert_rq_held(rq);
- assert_clock_updated(rq);
+ u64 throttled;
- return rq->clock_pelt - rq->lost_idle_time;
+ if (unlikely(cfs_rq->throttle_count))
+ throttled = U64_MAX;
+ else
+ throttled = cfs_rq->throttled_clock_pelt_time;
+
+ u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
-#ifdef CONFIG_CFS_BANDWIDTH
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
@@ -150,6 +174,7 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
@@ -204,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a337f3e35997..ec66b40bdd40 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -957,10 +957,16 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return 0;
- cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
- if (!cgroup->psi.pcpu)
+ cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL);
+ if (!cgroup->psi)
return -ENOMEM;
- group_init(&cgroup->psi);
+
+ cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi->pcpu) {
+ kfree(cgroup->psi);
+ return -ENOMEM;
+ }
+ group_init(cgroup->psi);
return 0;
}
@@ -969,10 +975,11 @@ void psi_cgroup_free(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return;
- cancel_delayed_work_sync(&cgroup->psi.avgs_work);
- free_percpu(cgroup->psi.pcpu);
+ cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+ free_percpu(cgroup->psi->pcpu);
/* All triggers must be removed by now */
- WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+ WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+ kfree(cgroup->psi);
}
/**
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8c9ed9664840..55f39c8f4203 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -480,7 +480,7 @@ static inline void rt_queue_push_tasks(struct rq *rq)
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
@@ -601,7 +601,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se) {
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
@@ -687,7 +687,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1089,7 +1089,7 @@ static void update_curr_rt(struct rq *rq)
}
static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -1100,7 +1100,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
- sub_nr_running(rq, rt_rq->rt_nr_running);
+ sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
}
@@ -1486,18 +1486,21 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
+ unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
- dequeue_top_rt_rq(rt_rq_of_se(back));
+ rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
+
+ dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 01259611beb9..a6f071b2acac 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -27,6 +27,7 @@
#include <linux/capability.h>
#include <linux/cgroup_api.h>
#include <linux/cgroup.h>
+#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpumask_api.h>
#include <linux/ctype.h>
@@ -520,6 +521,45 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+/*
+ * u64_u32_load/u64_u32_store
+ *
+ * Use a copy of a u64 value to protect against data race. This is only
+ * applicable for 32-bits architectures.
+ */
+#ifdef CONFIG_64BIT
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
+#else
+# define u64_u32_load_copy(var, copy) \
+({ \
+ u64 __val, __val_copy; \
+ do { \
+ __val_copy = copy; \
+ /* \
+ * paired with u64_u32_store_copy(), ordering access \
+ * to var and copy. \
+ */ \
+ smp_rmb(); \
+ __val = var; \
+ } while (__val != __val_copy); \
+ __val; \
+})
+# define u64_u32_store_copy(var, copy, val) \
+do { \
+ typeof(val) __val = (val); \
+ var = __val; \
+ /* \
+ * paired with u64_u32_load_copy(), ordering access to var and \
+ * copy. \
+ */ \
+ smp_wmb(); \
+ copy = __val; \
+} while (0)
+#endif
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -560,7 +600,7 @@ struct cfs_rq {
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
- u64 load_last_update_time_copy;
+ u64 last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
@@ -609,6 +649,10 @@ struct cfs_rq {
int runtime_enabled;
s64 runtime_remaining;
+ u64 throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+ u64 throttled_pelt_idle_copy;
+#endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -981,6 +1025,12 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
+ u64 clock_pelt_idle;
+ u64 clock_idle;
+#ifndef CONFIG_64BIT
+ u64 clock_pelt_idle_copy;
+ u64 clock_idle_copy;
+#endif
atomic_t nr_iowait;
@@ -1693,6 +1743,11 @@ queue_balance_callback(struct rq *rq,
{
lockdep_assert_rq_held(rq);
+ /*
+ * Don't (re)queue an already queued item; nor queue anything when
+ * balance_push() is active, see the comment with
+ * balance_push_callback.
+ */
if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
@@ -1810,15 +1865,6 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
return to_cpumask(sg->sgc->cpumask);
}
-/**
- * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
- * @group: The group whose first CPU is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_span(group));
-}
-
extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
@@ -2039,7 +2085,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2847,7 +2892,7 @@ enum cpu_util_type {
};
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05b6c2ad90b9..8739c2a5a54e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2316,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/*
* For a single LLC per node, allow an
- * imbalance up to 25% of the node. This is an
- * arbitrary cutoff based on SMT-2 to balance
- * between memory bandwidth and avoiding
- * premature sharing of HT resources and SMT-4
- * or SMT-8 *may* benefit from a different
- * cutoff.
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
*
* For multiple LLCs, allow an imbalance
* until multiple tasks would share an LLC
* on one node while LLCs on another node
- * remain idle.
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
*/
nr_llcs = sd->span_weight / child->span_weight;
if (nr_llcs == 1)
- imb = sd->span_weight >> 2;
+ imb = sd->span_weight >> 3;
else
imb = nr_llcs;
+ imb = max(1U, imb);
sd->imb_numa_nr = imb;
/* Set span based on the first NUMA domain. */
diff --git a/kernel/signal.c b/kernel/signal.c
index edb1dc9b00dc..6f86fda5e432 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2029,12 +2029,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
bool autoreap = false;
u64 utime, stime;
- BUG_ON(sig == -1);
+ WARN_ON_ONCE(sig == -1);
- /* do_notify_parent_cldstop should have been called instead. */
- BUG_ON(task_is_stopped_or_traced(tsk));
+ /* do_notify_parent_cldstop should have been called instead. */
+ WARN_ON_ONCE(task_is_stopped_or_traced(tsk));
- BUG_ON(!tsk->ptrace &&
+ WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/* Wake up all pidfd waiters */
diff --git a/kernel/smp.c b/kernel/smp.c
index dd215f439426..650810a6f29b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str)
if (val)
static_branch_enable(&csdlock_debug_enabled);
- return 0;
+ return 1;
}
-early_param("csdlock_debug", csdlock_debug);
+__setup("csdlock_debug=", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f0aef8aa9ff..c8a6913c067d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -620,7 +620,7 @@ void irq_enter_rcu(void)
*/
void irq_enter(void)
{
- rcu_irq_enter();
+ ct_irq_enter();
irq_enter_rcu();
}
@@ -672,7 +672,7 @@ void irq_exit_rcu(void)
void irq_exit(void)
{
__irq_exit_rcu();
- rcu_irq_exit();
+ ct_irq_exit();
/* must be last! */
lockdep_hardirq_exit();
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e52b6e372c60..35d034219513 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -446,14 +446,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
if (*negp) {
if (*lvalp > (unsigned long) INT_MAX + 1)
return -EINVAL;
- *valp = -*lvalp;
+ WRITE_ONCE(*valp, -*lvalp);
} else {
if (*lvalp > (unsigned long) INT_MAX)
return -EINVAL;
- *valp = *lvalp;
+ WRITE_ONCE(*valp, *lvalp);
}
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
if (val < 0) {
*negp = true;
*lvalp = -(unsigned long)val;
@@ -472,9 +472,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- *valp = *lvalp;
+ WRITE_ONCE(*valp, *lvalp);
} else {
- unsigned int val = *valp;
+ unsigned int val = READ_ONCE(*valp);
*lvalp = (unsigned long)val;
}
return 0;
@@ -857,7 +857,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
if ((param->min && *param->min > tmp) ||
(param->max && *param->max < tmp))
return -EINVAL;
- *valp = tmp;
+ WRITE_ONCE(*valp, tmp);
}
return 0;
@@ -923,7 +923,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
(param->max && *param->max < tmp))
return -ERANGE;
- *valp = tmp;
+ WRITE_ONCE(*valp, tmp);
}
return 0;
@@ -1007,13 +1007,13 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write,
tmp.maxlen = sizeof(val);
tmp.data = &val;
- val = *data;
+ val = READ_ONCE(*data);
res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
do_proc_douintvec_minmax_conv, &param);
if (res)
return res;
if (write)
- *data = val;
+ WRITE_ONCE(*data, val);
return 0;
}
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
@@ -1090,9 +1090,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
err = -EINVAL;
break;
}
- *i = val;
+ WRITE_ONCE(*i, val);
} else {
- val = convdiv * (*i) / convmul;
+ val = convdiv * READ_ONCE(*i) / convmul;
if (!first)
proc_put_char(&buffer, &left, '\t');
proc_put_long(&buffer, &left, val, false);
@@ -1173,9 +1173,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*lvalp > INT_MAX / HZ)
return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ if (*negp)
+ WRITE_ONCE(*valp, -*lvalp * HZ);
+ else
+ WRITE_ONCE(*valp, *lvalp * HZ);
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
unsigned long lval;
if (val < 0) {
*negp = true;
@@ -1221,9 +1224,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
if (jif > INT_MAX)
return 1;
- *valp = (int)jif;
+ WRITE_ONCE(*valp, (int)jif);
} else {
- int val = *valp;
+ int val = READ_ONCE(*valp);
unsigned long lval;
if (val < 0) {
*negp = true;
@@ -1291,8 +1294,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
* @ppos: the current position in the file
*
* Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
* are converted into jiffies.
*
* Returns 0 on success.
@@ -2091,6 +2094,17 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO_HUNDRED,
},
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
#ifdef CONFIG_HUGETLB_PAGE
{
.procname = "nr_hugepages",
@@ -2107,15 +2121,6 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
},
- {
- .procname = "numa_stat",
- .data = &sysctl_vm_numa_stat,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_vm_numa_stat_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
#endif
{
.procname = "hugetlb_shm_group",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 27b7868b5c30..a41753be1a2b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -73,6 +73,15 @@ config TIME_KUNIT_TEST
If unsure, say N.
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_IDLE
+ bool
+ select CONTEXT_TRACKING
+ help
+ Tracks idle state on behalf of RCU.
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
@@ -111,7 +120,7 @@ config NO_HZ_FULL
# NO_HZ_COMMON dependency
# We need at least one periodic CPU for timekeeping
depends on SMP
- depends on HAVE_CONTEXT_TRACKING
+ depends on HAVE_CONTEXT_TRACKING_USER
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
@@ -137,31 +146,37 @@ config NO_HZ_FULL
endchoice
-config CONTEXT_TRACKING
- bool
+config CONTEXT_TRACKING_USER
+ bool
+ depends on HAVE_CONTEXT_TRACKING_USER
+ select CONTEXT_TRACKING
+ help
+ Track transitions between kernel and user on behalf of RCU and
+ tickless cputime accounting. The former case relies on context
+ tracking to enter/exit RCU extended quiescent states.
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
+config CONTEXT_TRACKING_USER_FORCE
+ bool "Force user context tracking"
+ depends on CONTEXT_TRACKING_USER
default y if !NO_HZ_FULL
help
The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
+ support the user context tracking subsystem. But there are also
other dependencies to provide in order to make the full
dynticks working.
This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fulfill all the
+ user context tracking backend but doesn't yet fulfill all the
requirements to make the full dynticks feature working.
Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
+ for user context tracking and the subsystems that rely on it: RCU
userspace extended quiescent state and tickless cputime
accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
+ dynticks subsystem by forcing the user context tracking on all
CPUs in the system.
Say Y only if you're working on the development of an
- architecture backend for the context tracking.
+ architecture backend for the user context tracking.
Say N otherwise, this option brings an overhead that you
don't want in production.
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1cd10b102c51..5dead89308b7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1051,15 +1051,24 @@ retry_delete:
}
/*
- * This is called by do_exit or de_thread, only when there are no more
- * references to the shared signal_struct.
+ * This is called by do_exit or de_thread, only when nobody else can
+ * modify the signal->posix_timers list. Yet we need sighand->siglock
+ * to prevent the race with /proc/pid/timers.
*/
-void exit_itimers(struct signal_struct *sig)
+void exit_itimers(struct task_struct *tsk)
{
+ struct list_head timers;
struct k_itimer *tmr;
- while (!list_empty(&sig->posix_timers)) {
- tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+ if (list_empty(&tsk->signal->posix_timers))
+ return;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ list_replace_init(&tsk->signal->posix_timers, &timers);
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ while (!list_empty(&timers)) {
+ tmr = list_first_entry(&timers, struct k_itimer, list);
itimer_delete(tmr);
}
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 58a11f859ac7..b0e3c9205946 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -526,7 +526,6 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
-EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@@ -571,7 +570,7 @@ void __init tick_nohz_init(void)
}
for_each_cpu(cpu, tick_nohz_full_mask)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"kernel/nohz:predown", NULL,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8e4b3c32fcf9..f72b9f1de178 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,6 +23,7 @@
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
+#include <linux/random.h>
#include "tick-internal.h"
#include "ntp_internal.h"
@@ -1343,8 +1344,10 @@ out:
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
- if (!ret)
+ if (!ret) {
audit_tk_injoffset(ts_delta);
+ add_device_randomness(ts, sizeof(*ts));
+ }
return ret;
}
@@ -2430,6 +2433,7 @@ int do_adjtimex(struct __kernel_timex *txc)
ret = timekeeping_validate_timex(txc);
if (ret)
return ret;
+ add_device_randomness(txc, sizeof(*txc));
if (txc->modes & ADJ_SETOFFSET) {
struct timespec64 delta;
@@ -2447,6 +2451,7 @@ int do_adjtimex(struct __kernel_timex *txc)
audit_ntp_init(&ad);
ktime_get_real_ts64(&ts);
+ add_device_randomness(&ts, sizeof(ts));
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index debbbb083286..ccd6a5ade3e9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -194,7 +194,8 @@ config FUNCTION_TRACER
sequence is then dynamically patched into a tracer call when
tracing is enabled by the administrator. If it's runtime disabled
(the bootup default), then the overhead of the instructions is very
- small and not measurable even in micro-benchmarks.
+ small and not measurable even in micro-benchmarks (at least on
+ x86, but may have impact on other architectures).
config FUNCTION_GRAPH_TRACER
bool "Kernel Function Graph Tracer"
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 10a32b0f2deb..7f5eb295fe19 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \
(ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
@@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data, u64 cgid)
+ const blk_opf_t opf, u32 what, int error,
+ int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ const enum req_op op = opf & REQ_OP_MASK;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
what |= ddir_act[op_is_write(op) ? WRITE : READ];
- what |= MASK_TC_BIT(op_flags, SYNC);
- what |= MASK_TC_BIT(op_flags, RAHEAD);
- what |= MASK_TC_BIT(op_flags, META);
- what |= MASK_TC_BIT(op_flags, PREFLUSH);
- what |= MASK_TC_BIT(op_flags, FUA);
+ what |= MASK_TC_BIT(opf, SYNC);
+ what |= MASK_TC_BIT(opf, RAHEAD);
+ what |= MASK_TC_BIT(opf, META);
+ what |= MASK_TC_BIT(opf, PREFLUSH);
+ what |= MASK_TC_BIT(opf, FUA);
if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
@@ -736,12 +737,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
switch (cmd) {
case BLKTRACESETUP:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
- bdevname(bdev, b);
+ snprintf(b, sizeof(b), "%pg", bdev);
ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#endif
@@ -770,14 +771,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- mutex_lock(&q->debugfs_mutex);
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
-
- mutex_unlock(&q->debugfs_mutex);
}
#ifdef CONFIG_BLK_CGROUP
@@ -845,9 +843,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error,
else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, blk_status_to_errno(error), 0,
- NULL, cgid);
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags,
+ what, blk_status_to_errno(error), 0, NULL, cgid);
rcu_read_unlock();
}
@@ -906,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
@@ -952,7 +949,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
}
@@ -972,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
rcu_read_unlock();
}
@@ -988,8 +985,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT,
+ bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT,
blk_status_to_errno(bio->bi_status),
sizeof(rpdu), &rpdu,
blk_trace_bio_get_cgid(q, bio));
@@ -1025,7 +1021,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ bio->bi_opf, BLK_TA_REMAP,
blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
@@ -1061,7 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
+ rq->cmd_flags, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
@@ -1087,7 +1083,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len)
return;
}
- __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(rq));
rcu_read_unlock();
@@ -1870,17 +1866,6 @@ out_unlock_bdev:
out:
return ret ? ret : count;
}
-
-int blk_trace_init_sysfs(struct device *dev)
-{
- return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
-}
-
-void blk_trace_remove_sysfs(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
-}
-
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
@@ -1888,21 +1873,21 @@ void blk_trace_remove_sysfs(struct device *dev)
/**
* blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
* @rwbs: buffer to be filled
- * @op: REQ_OP_XXX for the tracepoint
+ * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint
*
* Description:
- * Maps the REQ_OP_XXX to character and fills the buffer provided by the
- * caller with resulting string.
+ * Maps each request operation and flag to a single character and fills the
+ * buffer provided by the caller with resulting string.
*
**/
-void blk_fill_rwbs(char *rwbs, unsigned int op)
+void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
{
int i = 0;
- if (op & REQ_PREFLUSH)
+ if (opf & REQ_PREFLUSH)
rwbs[i++] = 'F';
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_WRITE:
rwbs[i++] = 'W';
break;
@@ -1923,13 +1908,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
rwbs[i++] = 'N';
}
- if (op & REQ_FUA)
+ if (opf & REQ_FUA)
rwbs[i++] = 'F';
- if (op & REQ_RAHEAD)
+ if (opf & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (op & REQ_SYNC)
+ if (opf & REQ_SYNC)
rwbs[i++] = 'S';
- if (op & REQ_META)
+ if (opf & REQ_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a13e6ac6327..88589d74a892 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
kprobe_multi_link_prog_run(link, entry_ip, regs);
}
-static int symbols_cmp(const void *a, const void *b)
+static int symbols_cmp_r(const void *a, const void *b, const void *priv)
{
const char **str_a = (const char **) a;
const char **str_b = (const char **) b;
@@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b)
return strcmp(*str_a, *str_b);
}
+struct multi_symbols_sort {
+ const char **funcs;
+ u64 *cookies;
+};
+
+static void symbols_swap_r(void *a, void *b, int size, const void *priv)
+{
+ const struct multi_symbols_sort *data = priv;
+ const char **name_a = a, **name_b = b;
+
+ swap(*name_a, *name_b);
+
+ /* If defined, swap also related cookies. */
+ if (data->cookies) {
+ u64 *cookie_a, *cookie_b;
+
+ cookie_a = data->cookies + (name_a - data->funcs);
+ cookie_b = data->cookies + (name_b - data->funcs);
+ swap(*cookie_a, *cookie_b);
+ }
+}
+
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!addrs)
return -ENOMEM;
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
if (uaddrs) {
if (copy_from_user(addrs, uaddrs, size)) {
err = -EFAULT;
goto error;
}
} else {
+ struct multi_symbols_sort data = {
+ .cookies = cookies,
+ };
struct user_syms us;
err = copy_user_syms(&us, usyms, cnt);
if (err)
goto error;
- sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL);
+ if (cookies)
+ data.funcs = us.syms;
+
+ sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
+ symbols_swap_r, &data);
+
err = ftrace_lookup_symbols(us.syms, cnt, addrs);
free_user_syms(&us);
if (err)
goto error;
}
- ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
- if (ucookies) {
- cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
- if (!cookies) {
- err = -ENOMEM;
- goto error;
- }
- if (copy_from_user(cookies, ucookies, size)) {
- err = -EFAULT;
- goto error;
- }
- }
-
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
err = -ENOMEM;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e750fe141a60..601ccf1b2f09 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name,
struct module *mod, unsigned long addr)
{
struct kallsyms_data *args = data;
+ const char **sym;
+ int idx;
- if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp))
+ sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp);
+ if (!sym)
+ return 0;
+
+ idx = sym - args->syms;
+ if (args->addrs[idx])
return 0;
addr = ftrace_location(addr);
if (!addr)
return 0;
- args->addrs[args->found++] = addr;
+ args->addrs[idx] = addr;
+ args->found++;
return args->found == args->cnt ? 1 : 0;
}
@@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
struct kallsyms_data args;
int err;
+ memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
args.cnt = cnt;
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index b56833700d23..c69d82273ce7 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!handler))
return NULL;
+ /*
+ * This expects the caller will set up a rethook on a function entry.
+ * When the function returns, the rethook will eventually be reclaimed
+ * or released in the rethook_recycle() with call_rcu().
+ * This means the caller must be run in the RCU-availabe context.
+ */
+ if (unlikely(!rcu_is_watching()))
+ return NULL;
+
fn = freelist_try_get(&rh->pool);
if (!fn)
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c95992e2c71..0c517c8c8999 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3105,17 +3105,17 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
}
/*
- * When an NMI triggers, RCU is enabled via rcu_nmi_enter(),
+ * When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
- * triggered someplace critical, and rcu_irq_enter() should
+ * triggered someplace critical, and ct_irq_enter() should
* not be called from NMI.
*/
if (unlikely(in_nmi()))
return;
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
}
/**
@@ -6424,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
synchronize_rcu();
free_snapshot(tr);
}
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
@@ -9866,6 +9864,12 @@ void trace_init_global_iter(struct trace_iterator *iter)
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
if (trace_clocks[iter->tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ /* Can not use kmalloc for iter.temp and iter.fmt */
+ iter->temp = static_temp_buf;
+ iter->temp_size = STATIC_TEMP_BUF_SIZE;
+ iter->fmt = static_fmt_buf;
+ iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
@@ -9898,11 +9902,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* Simulate the iterator */
trace_init_global_iter(&iter);
- /* Can not use kmalloc for iter.temp and iter.fmt */
- iter.temp = static_temp_buf;
- iter.temp_size = STATIC_TEMP_BUF_SIZE;
- iter.fmt = static_fmt_buf;
- iter.fmt_size = STATIC_FMT_BUF_SIZE;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 48e82e141d54..e87a46794079 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4430,6 +4430,8 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL);
if (!s) {
+ kfree(hist_data->attrs->var_defs.name[n_vars]);
+ hist_data->attrs->var_defs.name[n_vars] = NULL;
ret = -ENOMEM;
goto free;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 93507330462c..a245ea673715 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1718,8 +1718,17 @@ static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct kretprobe *rp = get_kretprobe(ri);
- struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp);
+ struct trace_kprobe *tk;
+
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregister on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return 0;
+ tk = container_of(rp, struct trace_kprobe, rp);
raw_cpu_inc(*tk->nhit);
if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..c3dc4f859a6b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv)
bool is_return = false;
int i, ret;
- ret = 0;
ref_ctr_offset = 0;
switch (argv[0][0]) {
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 230038d4f908..a6f9bdd956c3 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -4,7 +4,7 @@
* Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
- * See Documentation/watch_queue.rst
+ * See Documentation/core-api/watch_queue.rst
*/
#define pr_fmt(fmt) "watchq: " fmt
@@ -34,6 +34,27 @@ MODULE_LICENSE("GPL");
#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+/*
+ * This must be called under the RCU read-lock, which makes
+ * sure that the wqueue still exists. It can then take the lock,
+ * and check that the wqueue hasn't been destroyed, which in
+ * turn makes sure that the notification pipe still exists.
+ */
+static inline bool lock_wqueue(struct watch_queue *wqueue)
+{
+ spin_lock_bh(&wqueue->lock);
+ if (unlikely(wqueue->defunct)) {
+ spin_unlock_bh(&wqueue->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void unlock_wqueue(struct watch_queue *wqueue)
+{
+ spin_unlock_bh(&wqueue->lock);
+}
+
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
/*
* Post a notification to a watch queue.
+ *
+ * Must be called with the RCU lock for reading, and the
+ * watch_queue lock held, which guarantees that the pipe
+ * hasn't been released.
*/
static bool post_one_notification(struct watch_queue *wqueue,
struct watch_notification *n)
@@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
spin_lock_irq(&pipe->rd_wait.lock);
- if (wqueue->defunct)
- goto out;
-
mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;
@@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist,
if (security_post_notification(watch->cred, cred, n) < 0)
continue;
- post_one_notification(wqueue, n);
+ if (lock_wqueue(wqueue)) {
+ post_one_notification(wqueue, n);
+ unlock_wqueue(wqueue);
+ }
}
rcu_read_unlock();
@@ -429,6 +454,33 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
rcu_assign_pointer(watch->queue, wqueue);
}
+static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
+{
+ const struct cred *cred;
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ cred = current_cred();
+ if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
+ atomic_dec(&cred->user->nr_watches);
+ return -EAGAIN;
+ }
+
+ watch->cred = get_cred(cred);
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+
/**
* add_watch_to_object - Add a watch on an object to a watch list
* @watch: The watch to add
@@ -443,33 +495,21 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
*/
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
- struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
- struct watch *w;
-
- hlist_for_each_entry(w, &wlist->watchers, list_node) {
- struct watch_queue *wq = rcu_access_pointer(w->queue);
- if (wqueue == wq && watch->id == w->id)
- return -EBUSY;
- }
+ struct watch_queue *wqueue;
+ int ret = -ENOENT;
- watch->cred = get_current_cred();
- rcu_assign_pointer(watch->watch_list, wlist);
+ rcu_read_lock();
- if (atomic_inc_return(&watch->cred->user->nr_watches) >
- task_rlimit(current, RLIMIT_NOFILE)) {
- atomic_dec(&watch->cred->user->nr_watches);
- put_cred(watch->cred);
- return -EAGAIN;
+ wqueue = rcu_access_pointer(watch->queue);
+ if (lock_wqueue(wqueue)) {
+ spin_lock(&wlist->lock);
+ ret = add_one_watch(watch, wlist, wqueue);
+ spin_unlock(&wlist->lock);
+ unlock_wqueue(wqueue);
}
- spin_lock_bh(&wqueue->lock);
- kref_get(&wqueue->usage);
- kref_get(&watch->usage);
- hlist_add_head(&watch->queue_node, &wqueue->watches);
- spin_unlock_bh(&wqueue->lock);
-
- hlist_add_head(&watch->list_node, &wlist->watchers);
- return 0;
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(add_watch_to_object);
@@ -520,20 +560,15 @@ found:
wqueue = rcu_dereference(watch->queue);
- /* We don't need the watch list lock for the next bit as RCU is
- * protecting *wqueue from deallocation.
- */
- if (wqueue) {
+ if (lock_wqueue(wqueue)) {
post_one_notification(wqueue, &n.watch);
- spin_lock_bh(&wqueue->lock);
-
if (!hlist_unhashed(&watch->queue_node)) {
hlist_del_init_rcu(&watch->queue_node);
put_watch(watch);
}
- spin_unlock_bh(&wqueue->lock);
+ unlock_wqueue(wqueue);
}
if (wlist->release_watch) {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 20a7a55e62b6..ecb0e8346e65 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -424,8 +424,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* Start period for the next softlockup warning. */
update_report_ts();
- printk_prefer_direct_enter();
-
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -444,8 +442,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
-
- printk_prefer_direct_exit();
}
return HRTIMER_RESTART;
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 701f35f0e2d4..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,8 +135,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- printk_prefer_direct_enter();
-
pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
this_cpu);
print_modules();
@@ -157,8 +155,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
- printk_prefer_direct_exit();
-
__this_cpu_write(hard_watchdog_warn, true);
return;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ea50f6be843..aa8a82bc6738 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5001,7 +5001,10 @@ static void unbind_workers(int cpu)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);