summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/events/core.c800
-rw-r--r--kernel/events/hw_breakpoint.c124
-rw-r--r--kernel/fail_function.c10
-rw-r--r--kernel/jump_label.c7
-rw-r--r--kernel/locking/lockdep.c26
-rw-r--r--kernel/locking/mutex.c37
-rw-r--r--kernel/locking/rtmutex.c3
-rw-r--r--kernel/locking/rtmutex_common.h11
-rw-r--r--kernel/locking/rwsem.c4
-rw-r--r--kernel/locking/rwsem.h8
-rw-r--r--kernel/memremap.c1
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/rcu/rcu.h38
-rw-r--r--kernel/rcu/rcuperf.c21
-rw-r--r--kernel/rcu/rcutorture.c72
-rw-r--r--kernel/rcu/srcutree.c29
-rw-r--r--kernel/rcu/tree.c72
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h34
-rw-r--r--kernel/sched/debug.c29
-rw-r--r--kernel/time/Kconfig10
-rw-r--r--kernel/time/posix-timers.c11
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/trace/bpf_trace.c68
-rw-r--r--kernel/trace/trace_event_perf.c102
-rw-r--r--kernel/trace/trace_kprobe.c95
-rw-r--r--kernel/trace/trace_probe.c8
-rw-r--r--kernel/trace/trace_probe.h13
-rw-r--r--kernel/trace/trace_uprobe.c86
32 files changed, 1327 insertions, 492 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..43f95d190eea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
- if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+ if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
err = check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b838470fac4..fc1c330c6bd6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader)
{
struct perf_event *sibling;
- list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ for_each_sibling_event(sibling, leader)
perf_event_update_time(sibling);
}
@@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp_out = cpuctx->cgrp;
- if (cgrp_out)
- __update_cgrp_time(cgrp_out);
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
+ struct cgroup_subsys_state *css;
+
+ if (cgrp) {
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ __update_cgrp_time(cgrp);
+ }
+ }
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
struct perf_cgroup *cgrp;
struct perf_cgroup_info *info;
+ struct cgroup_subsys_state *css;
/*
* ctx->lock held by caller
@@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
return;
cgrp = perf_cgroup_from_task(task, ctx);
- info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+ }
}
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
@@ -937,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event,
if (!is_cgroup_event(event))
return;
- if (add && ctx->nr_cgroups++)
- return;
- else if (!add && --ctx->nr_cgroups)
- return;
/*
* Because cgroup events are always per-cpu events,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
- if (add) {
+
+ /*
+ * Since setting cpuctx->cgrp is conditional on the current @cgrp
+ * matching the event's cgroup, we must do this for every new event,
+ * because if the first would mismatch, the second would not try again
+ * and we would leave cpuctx->cgrp unset.
+ */
+ if (add && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
- } else {
- list_del(cpuctx_entry);
- cpuctx->cgrp = NULL;
}
+
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+
+ /* no cgroup running */
+ if (!add)
+ cpuctx->cgrp = NULL;
+
+ cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+ if (add)
+ list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ else
+ list_del(cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1041,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event,
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- int rotations = 0;
+ bool rotations;
lockdep_assert_irqs_disabled();
@@ -1460,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event)
return event_type;
}
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Helper function to initialize event group nodes.
+ */
+static void init_event_group(struct perf_event *event)
+{
+ RB_CLEAR_NODE(&event->group_node);
+ event->group_index = 0;
+}
+
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits.
+ */
+static struct perf_event_groups *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
@@ -1470,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
}
/*
+ * Helper function to initializes perf_event_group trees.
+ */
+static void perf_event_groups_init(struct perf_event_groups *groups)
+{
+ groups->tree = RB_ROOT;
+ groups->index = 0;
+}
+
+/*
+ * Compare function for event groups;
+ *
+ * Implements complex key that first sorts by CPU and then by virtual index
+ * which provides ordering when rotating groups for the same CPU.
+ */
+static bool
+perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+{
+ if (left->cpu < right->cpu)
+ return true;
+ if (left->cpu > right->cpu)
+ return false;
+
+ if (left->group_index < right->group_index)
+ return true;
+ if (left->group_index > right->group_index)
+ return false;
+
+ return false;
+}
+
+/*
+ * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
+ * key (see perf_event_groups_less). This places it last inside the CPU
+ * subtree.
+ */
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ struct perf_event *node_event;
+ struct rb_node *parent;
+ struct rb_node **node;
+
+ event->group_index = ++groups->index;
+
+ node = &groups->tree.rb_node;
+ parent = *node;
+
+ while (*node) {
+ parent = *node;
+ node_event = container_of(*node, struct perf_event, group_node);
+
+ if (perf_event_groups_less(event, node_event))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&event->group_node, parent, node);
+ rb_insert_color(&event->group_node, &groups->tree);
+}
+
+/*
+ * Helper function to insert event into the pinned or flexible groups.
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from a tree.
+ */
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
+ RB_EMPTY_ROOT(&groups->tree));
+
+ rb_erase(&event->group_node, &groups->tree);
+ init_event_group(event);
+}
+
+/*
+ * Helper function to delete event from its groups.
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Get the leftmost event in the @cpu subtree.
+ */
+static struct perf_event *
+perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+{
+ struct perf_event *node_event = NULL, *match = NULL;
+ struct rb_node *node = groups->tree.rb_node;
+
+ while (node) {
+ node_event = container_of(node, struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ match = node_event;
+ node = node->rb_left;
+ }
+ }
+
+ return match;
+}
+
+/*
+ * Like rb_entry_next_safe() for the @cpu subtree.
+ */
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event)
+{
+ struct perf_event *next;
+
+ next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
+ if (next && next->cpu == event->cpu)
+ return next;
+
+ return NULL;
+}
+
+/*
+ * Iterate through the whole groups tree.
+ */
+#define perf_event_groups_for_each(event, groups) \
+ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
+ typeof(*event), group_node); event; \
+ event = rb_entry_safe(rb_next(&event->group_node), \
+ typeof(*event), group_node))
+
+/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1489,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
-
event->group_caps = event->event_caps;
-
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ add_event_to_groups(event, ctx);
}
list_update_cgroup_event(event, ctx, true);
@@ -1652,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event)
group_leader->group_caps &= event->event_caps;
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
perf_event__header_size(group_leader);
- list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ for_each_sibling_event(pos, group_leader)
perf_event__header_size(pos);
}
@@ -1688,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ del_event_from_groups(event, ctx);
/*
* If event was in error state, then keep it
@@ -1706,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
- struct list_head *list = NULL;
+ struct perf_event_context *ctx = event->ctx;
- lockdep_assert_held(&event->ctx->lock);
+ lockdep_assert_held(&ctx->lock);
/*
* We can have double detach due to exit/hot-unplug + close.
@@ -1722,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event)
* If this is a sibling, remove it from its group.
*/
if (event->group_leader != event) {
- list_del_init(&event->group_entry);
+ list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
}
- if (!list_empty(&event->group_entry))
- list = &event->group_entry;
-
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
- list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- if (list)
- list_move_tail(&sibling->group_entry, list);
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+
sibling->group_leader = sibling;
+ list_del_init(&sibling->sibling_list);
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
+ if (!RB_EMPTY_NODE(&event->group_node)) {
+ add_event_to_groups(sibling, event->ctx);
+
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
+ struct list_head *list = sibling->attr.pinned ?
+ &ctx->pinned_active : &ctx->flexible_active;
+
+ list_add_tail(&sibling->active_list, list);
+ }
+ }
+
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
perf_event__header_size(event->group_leader);
- list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ for_each_sibling_event(tmp, event->group_leader)
perf_event__header_size(tmp);
}
@@ -1772,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event)
*/
static inline int pmu_filter_match(struct perf_event *event)
{
- struct perf_event *child;
+ struct perf_event *sibling;
if (!__pmu_filter_match(event))
return 0;
- list_for_each_entry(child, &event->sibling_list, group_entry) {
- if (!__pmu_filter_match(child))
+ for_each_sibling_event(sibling, event) {
+ if (!__pmu_filter_match(sibling))
return 0;
}
@@ -1805,6 +1995,13 @@ event_sched_out(struct perf_event *event,
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ /*
+ * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
+ * we can schedule events _OUT_ individually through things like
+ * __perf_remove_from_context().
+ */
+ list_del_init(&event->active_list);
+
perf_pmu_disable(event->pmu);
event->pmu->del(event, 0);
@@ -1845,7 +2042,7 @@ group_sched_out(struct perf_event *group_event,
/*
* Schedule out siblings (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ for_each_sibling_event(event, group_event)
event_sched_out(event, cpuctx, ctx);
perf_pmu_enable(ctx->pmu);
@@ -2124,7 +2321,7 @@ group_sched_in(struct perf_event *group_event,
/*
* Schedule in siblings as one group (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ for_each_sibling_event(event, group_event) {
if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
@@ -2140,7 +2337,7 @@ group_error:
* partial group before returning:
* The events up to the failed event are scheduled out normally.
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ for_each_sibling_event(event, group_event) {
if (event == partial_group)
break;
@@ -2317,6 +2514,18 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&task_ctx->lock);
}
+#ifdef CONFIG_CGROUP_PERF
+ if (is_cgroup_event(event)) {
+ /*
+ * If the current cgroup doesn't match the event's
+ * cgroup, we should not try to schedule it.
+ */
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+ reprogram = cgroup_is_descendant(cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
+ }
+#endif
+
if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
@@ -2650,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh)
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
+static int perf_event_modify_breakpoint(struct perf_event *bp,
+ struct perf_event_attr *attr)
+{
+ int err;
+
+ _perf_event_disable(bp);
+
+ err = modify_user_hw_breakpoint_check(bp, attr, true);
+ if (err) {
+ if (!bp->attr.disabled)
+ _perf_event_enable(bp);
+
+ return err;
+ }
+
+ if (!attr->disabled)
+ _perf_event_enable(bp);
+ return 0;
+}
+
+static int perf_event_modify_attr(struct perf_event *event,
+ struct perf_event_attr *attr)
+{
+ if (event->attr.type != attr->type)
+ return -EINVAL;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_BREAKPOINT:
+ return perf_event_modify_breakpoint(event, attr);
+ default:
+ /* Place holder for future additions. */
+ return -EOPNOTSUPP;
+ }
+}
+
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
+ struct perf_event *event, *tmp;
int is_active = ctx->is_active;
- struct perf_event *event;
lockdep_assert_held(&ctx->lock);
@@ -2702,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
group_sched_out(event, cpuctx, ctx);
}
if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
}
perf_pmu_enable(ctx->pmu);
@@ -2994,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *), void *data)
{
- struct perf_event *event;
+ struct perf_event **evt, *evt1, *evt2;
+ int ret;
- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
+ evt1 = perf_event_groups_first(groups, -1);
+ evt2 = perf_event_groups_first(groups, cpu);
+
+ while (evt1 || evt2) {
+ if (evt1 && evt2) {
+ if (evt1->group_index < evt2->group_index)
+ evt = &evt1;
+ else
+ evt = &evt2;
+ } else if (evt1) {
+ evt = &evt1;
+ } else {
+ evt = &evt2;
+ }
- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
+ ret = func(*evt, data);
+ if (ret)
+ return ret;
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ *evt = perf_event_groups_next(*evt);
}
+
+ return 0;
+}
+
+struct sched_in_data {
+ struct perf_event_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ int can_add_hw;
+};
+
+static int pinned_sched_in(struct perf_event *event, void *data)
+{
+ struct sched_in_data *sid = data;
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+ if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+ list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+ }
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ return 0;
+}
+
+static int flexible_sched_in(struct perf_event *event, void *data)
+{
+ struct sched_in_data *sid = data;
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+ if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+ else
+ sid->can_add_hw = 0;
+ }
+
+ return 0;
+}
+
+static void
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct sched_in_data sid = {
+ .ctx = ctx,
+ .cpuctx = cpuctx,
+ .can_add_hw = 1,
+ };
+
+ visit_groups_merge(&ctx->pinned_groups,
+ smp_processor_id(),
+ pinned_sched_in, &sid);
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct perf_event *event;
- int can_add_hw = 1;
-
- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
+ struct sched_in_data sid = {
+ .ctx = ctx,
+ .cpuctx = cpuctx,
+ .can_add_hw = 1,
+ };
- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
- }
+ visit_groups_merge(&ctx->flexible_groups,
+ smp_processor_id(),
+ flexible_sched_in, &sid);
}
static void
@@ -3121,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!list_empty(&ctx->pinned_groups))
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
@@ -3350,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
}
/*
- * Round-robin a context's events:
+ * Move @event to the tail of the @ctx's elegible events.
*/
-static void rotate_ctx(struct perf_event_context *ctx)
+static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (ctx->rotate_disable)
+ return;
+
+ perf_event_groups_delete(&ctx->flexible_groups, event);
+ perf_event_groups_insert(&ctx->flexible_groups, event);
+}
+
+static inline struct perf_event *
+ctx_first_active(struct perf_event_context *ctx)
+{
+ return list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
+ struct perf_event *cpu_event = NULL, *task_event = NULL;
+ bool cpu_rotate = false, task_rotate = false;
struct perf_event_context *ctx = NULL;
- int rotate = 0;
+
+ /*
+ * Since we run this from IRQ context, nobody can install new
+ * events, thus the event count values are stable.
+ */
if (cpuctx->ctx.nr_events) {
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- rotate = 1;
+ cpu_rotate = true;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
if (ctx->nr_events != ctx->nr_active)
- rotate = 1;
+ task_rotate = true;
}
- if (!rotate)
- goto done;
+ if (!(cpu_rotate || task_rotate))
+ return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- if (ctx)
+ if (task_rotate)
+ task_event = ctx_first_active(ctx);
+ if (cpu_rotate)
+ cpu_event = ctx_first_active(&cpuctx->ctx);
+
+ /*
+ * As per the order given at ctx_resched() first 'pop' task flexible
+ * and then, if needed CPU flexible.
+ */
+ if (task_event || (ctx && cpu_event))
ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (cpu_event)
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- rotate_ctx(&cpuctx->ctx);
- if (ctx)
- rotate_ctx(ctx);
+ if (task_event)
+ rotate_ctx(ctx, task_event);
+ if (cpu_event)
+ rotate_ctx(&cpuctx->ctx, cpu_event);
perf_event_sched_in(cpuctx, ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-done:
- return rotate;
+ return true;
}
void perf_event_task_tick(void)
@@ -3543,7 +3876,7 @@ static void __perf_event_read(void *info)
pmu->read(event);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ for_each_sibling_event(sub, event) {
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3717,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ perf_event_groups_init(&ctx->pinned_groups);
+ perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
+ INIT_LIST_HEAD(&ctx->pinned_active);
+ INIT_LIST_HEAD(&ctx->flexible_active);
atomic_set(&ctx->refcount, 1);
}
@@ -4389,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
@@ -4583,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- list_for_each_entry(sibling, &event->sibling_list, group_entry)
+ for_each_sibling_event(sibling, event)
perf_event_for_each_child(sibling, func);
}
@@ -4665,6 +5000,8 @@ static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+ struct perf_event_attr *attr);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -4737,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_QUERY_BPF:
return perf_event_query_prog_array(event, (void __user *)arg);
+
+ case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
+ struct perf_event_attr new_attr;
+ int err = perf_copy_attr((struct perf_event_attr __user *)arg,
+ &new_attr);
+
+ if (err)
+ return err;
+
+ return perf_event_modify_attr(event, &new_attr);
+ }
default:
return -ENOTTY;
}
@@ -5732,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if (leader != event)
+ if ((leader != event) &&
+ (leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
values[n++] = perf_event_count(leader);
@@ -5741,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ for_each_sibling_event(sub, leader) {
n = 0;
if ((sub != event) &&
@@ -7998,9 +8347,119 @@ static struct pmu perf_tracepoint = {
.read = perf_swevent_read,
};
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+/*
+ * Flags in config, used by dynamic PMU kprobe and uprobe
+ * The flags should match following PMU_FORMAT_ATTR().
+ *
+ * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
+ * if not set, create kprobe/uprobe
+ */
+enum perf_probe_config {
+ PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
+};
+
+PMU_FORMAT_ATTR(retprobe, "config:0");
+
+static struct attribute *probe_attrs[] = {
+ &format_attr_retprobe.attr,
+ NULL,
+};
+
+static struct attribute_group probe_format_group = {
+ .name = "format",
+ .attrs = probe_attrs,
+};
+
+static const struct attribute_group *probe_attr_groups[] = {
+ &probe_format_group,
+ NULL,
+};
+#endif
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int perf_kprobe_event_init(struct perf_event *event);
+static struct pmu perf_kprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_kprobe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+ .attr_groups = probe_attr_groups,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event)
+{
+ int err;
+ bool is_retprobe;
+
+ if (event->attr.type != perf_kprobe.type)
+ return -ENOENT;
+ /*
+ * no branch sampling for probe events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ err = perf_kprobe_init(event, is_retprobe);
+ if (err)
+ return err;
+
+ event->destroy = perf_kprobe_destroy;
+
+ return 0;
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int perf_uprobe_event_init(struct perf_event *event);
+static struct pmu perf_uprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_uprobe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+ .attr_groups = probe_attr_groups,
+};
+
+static int perf_uprobe_event_init(struct perf_event *event)
+{
+ int err;
+ bool is_retprobe;
+
+ if (event->attr.type != perf_uprobe.type)
+ return -ENOENT;
+ /*
+ * no branch sampling for probe events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ err = perf_uprobe_init(event, is_retprobe);
+ if (err)
+ return err;
+
+ event->destroy = perf_uprobe_destroy;
+
+ return 0;
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
static inline void perf_tp_register(void)
{
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+#ifdef CONFIG_KPROBE_EVENTS
+ perf_pmu_register(&perf_kprobe, "kprobe", -1);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ perf_pmu_register(&perf_uprobe, "uprobe", -1);
+#endif
}
static void perf_event_free_filter(struct perf_event *event)
@@ -8077,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
}
#endif
+/*
+ * returns true if the event is a tracepoint, or a kprobe/upprobe created
+ * with perf_event_open()
+ */
+static inline bool perf_event_is_tracing(struct perf_event *event)
+{
+ if (event->pmu == &perf_tracepoint)
+ return true;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->pmu == &perf_kprobe)
+ return true;
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->pmu == &perf_uprobe)
+ return true;
+#endif
+ return false;
+}
+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
int ret;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog_fd);
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
@@ -8129,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
}
@@ -8325,7 +8803,8 @@ restart:
* * for kernel addresses: <start address>[/<size>]
* * for object files: <start address>[/<size>]@</path/to/object/file>
*
- * if <size> is not specified, the range is treated as a single address.
+ * if <size> is not specified or is zero, the range is treated as a single
+ * address; not valid for ACTION=="filter".
*/
enum {
IF_ACT_NONE = -1,
@@ -8375,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
return -ENOMEM;
while ((start = strsep(&fstr, " ,\n")) != NULL) {
+ static const enum perf_addr_filter_action_t actions[] = {
+ [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
+ [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
+ [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
+ };
ret = -EINVAL;
if (!*start)
@@ -8391,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
switch (token) {
case IF_ACT_FILTER:
case IF_ACT_START:
- filter->filter = 1;
-
case IF_ACT_STOP:
if (state != IF_STATE_ACTION)
goto fail;
+ filter->action = actions[token];
state = IF_STATE_SOURCE;
break;
@@ -8409,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (state != IF_STATE_SOURCE)
goto fail;
- if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
- filter->range = 1;
-
*args[0].to = 0;
ret = kstrtoul(args[0].from, 0, &filter->offset);
if (ret)
goto fail;
- if (filter->range) {
+ if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
*args[1].to = 0;
ret = kstrtoul(args[1].from, 0, &filter->size);
if (ret)
@@ -8425,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
- int fpos = filter->range ? 2 : 1;
+ int fpos = token == IF_SRC_FILE ? 2 : 1;
filename = match_strdup(&args[fpos]);
if (!filename) {
@@ -8451,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (kernel && event->attr.exclude_kernel)
goto fail;
+ /*
+ * ACTION "filter" must have a non-zero length region
+ * specified.
+ */
+ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
+ !filter->size)
+ goto fail;
+
if (!kernel) {
if (!filename)
goto fail;
@@ -8548,47 +9036,36 @@ fail_clear_files:
return ret;
}
-static int
-perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret;
-
- /*
- * Beware, here be dragons!!
- *
- * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
- * stuff does not actually need it. So temporarily drop ctx->mutex. As per
- * perf_event_ctx_lock() we already have a reference on ctx.
- *
- * This can result in event getting moved to a different ctx, but that
- * does not affect the tracepoint state.
- */
- mutex_unlock(&ctx->mutex);
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
- mutex_lock(&ctx->mutex);
-
- return ret;
-}
-
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
- char *filter_str;
int ret = -EINVAL;
-
- if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
- !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
- !has_addr_filter(event))
- return -EINVAL;
+ char *filter_str;
filter_str = strndup_user(arg, PAGE_SIZE);
if (IS_ERR(filter_str))
return PTR_ERR(filter_str);
- if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
- event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = perf_tracepoint_set_filter(event, filter_str);
- else if (has_addr_filter(event))
+#ifdef CONFIG_EVENT_TRACING
+ if (perf_event_is_tracing(event)) {
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but
+ * the tracepoint stuff does not actually need it. So
+ * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
+ * already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx,
+ * but that does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+ } else
+#endif
+ if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
kfree(filter_str);
@@ -9441,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
- INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->active_list);
+ init_event_group(event);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
@@ -9718,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = -EINVAL;
}
+ if (!attr->sample_max_stack)
+ attr->sample_max_stack = sysctl_perf_event_max_stack;
+
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
out:
@@ -9931,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
- if (!attr.sample_max_stack)
- attr.sample_max_stack = sysctl_perf_event_max_stack;
-
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -10207,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_remove_from_context(group_leader, 0);
put_ctx(gctx);
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
+ for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -10229,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open,
* By installing siblings first we NO-OP because they're not
* reachable through the group lists.
*/
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
+ for_each_sibling_event(sibling, group_leader) {
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
@@ -10869,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event,
* case inherit_event() will create individual events, similar to what
* perf_group_detach() would do anyway.
*/
- list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+ for_each_sibling_event(sub, parent_event) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
@@ -10968,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
@@ -10984,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3f8cb1e14588..6e28d2866be5 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -44,6 +44,7 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/bug.h>
#include <linux/hw_breakpoint.h>
/*
@@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp)
return 1;
}
-static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
- if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+ if (bp_type & HW_BREAKPOINT_RW)
return TYPE_DATA;
return TYPE_INST;
@@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.target == tsk &&
- find_slot_idx(iter) == type &&
+ find_slot_idx(iter->attr.bp_type) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
@@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
* ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
* + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
-static int __reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
struct bp_busy_slots slots = {0};
enum bp_type_idx type;
@@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp)
return -ENOMEM;
/* Basic checks */
- if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
- bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ if (bp_type == HW_BREAKPOINT_EMPTY ||
+ bp_type == HW_BREAKPOINT_INVALID)
return -EINVAL;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
fetch_bp_busy_slots(&slots, bp, type);
@@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp)
mutex_lock(&nr_bp_mutex);
- ret = __reserve_bp_slot(bp);
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
mutex_unlock(&nr_bp_mutex);
return ret;
}
-static void __release_bp_slot(struct perf_event *bp)
+static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
{
enum bp_type_idx type;
int weight;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
toggle_bp_slot(bp, false, type, weight);
}
@@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp)
mutex_lock(&nr_bp_mutex);
arch_unregister_hw_breakpoint(bp);
- __release_bp_slot(bp);
+ __release_bp_slot(bp, bp->attr.bp_type);
mutex_unlock(&nr_bp_mutex);
}
+static int __modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+ int err;
+
+ __release_bp_slot(bp, old_type);
+
+ err = __reserve_bp_slot(bp, bp->attr.bp_type);
+ if (err) {
+ /*
+ * Reserve the old_type slot back in case
+ * there's no space for the new type.
+ *
+ * This must succeed, because we just released
+ * the old_type slot in the __release_bp_slot
+ * call above. If not, something is broken.
+ */
+ WARN_ON(__reserve_bp_slot(bp, old_type));
+ }
+
+ return err;
+}
+
+static int modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+ int ret;
+
+ mutex_lock(&nr_bp_mutex);
+ ret = __modify_bp_slot(bp, old_type);
+ mutex_unlock(&nr_bp_mutex);
+ return ret;
+}
+
/*
* Allow the kernel debugger to reserve breakpoint slots without
* taking a lock using the dbg_* variant of for the reserve and
@@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp)
if (mutex_is_locked(&nr_bp_mutex))
return -1;
- return __reserve_bp_slot(bp);
+ return __reserve_bp_slot(bp, bp->attr.bp_type);
}
int dbg_release_bp_slot(struct perf_event *bp)
@@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp)
if (mutex_is_locked(&nr_bp_mutex))
return -1;
- __release_bp_slot(bp);
+ __release_bp_slot(bp, bp->attr.bp_type);
return 0;
}
@@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+ bool check)
+{
+ u64 old_addr = bp->attr.bp_addr;
+ u64 old_len = bp->attr.bp_len;
+ int old_type = bp->attr.bp_type;
+ bool modify = attr->bp_type != old_type;
+ int err = 0;
+
+ bp->attr.bp_addr = attr->bp_addr;
+ bp->attr.bp_type = attr->bp_type;
+ bp->attr.bp_len = attr->bp_len;
+
+ if (check && memcmp(&bp->attr, attr, sizeof(*attr)))
+ return -EINVAL;
+
+ err = validate_hw_breakpoint(bp);
+ if (!err && modify)
+ err = modify_bp_slot(bp, old_type);
+
+ if (err) {
+ bp->attr.bp_addr = old_addr;
+ bp->attr.bp_type = old_type;
+ bp->attr.bp_len = old_len;
+ return err;
+ }
+
+ bp->attr.disabled = attr->disabled;
+ return 0;
+}
+
/**
* modify_user_hw_breakpoint - modify a user-space hardware breakpoint
* @bp: the breakpoint structure to modify
* @attr: new breakpoint attributes
- * @triggered: callback to trigger when we hit the breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
- u64 old_addr = bp->attr.bp_addr;
- u64 old_len = bp->attr.bp_len;
- int old_type = bp->attr.bp_type;
- int err = 0;
-
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
* will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
else
perf_event_disable(bp);
- bp->attr.bp_addr = attr->bp_addr;
- bp->attr.bp_type = attr->bp_type;
- bp->attr.bp_len = attr->bp_len;
-
- if (attr->disabled)
- goto end;
+ if (!attr->disabled) {
+ int err = modify_user_hw_breakpoint_check(bp, attr, false);
- err = validate_hw_breakpoint(bp);
- if (!err)
+ if (err)
+ return err;
perf_event_enable(bp);
-
- if (err) {
- bp->attr.bp_addr = old_addr;
- bp->attr.bp_type = old_type;
- bp->attr.bp_len = old_len;
- if (!bp->attr.disabled)
- perf_event_enable(bp);
-
- return err;
+ bp->attr.disabled = 0;
}
-
-end:
- bp->attr.disabled = attr->disabled;
-
return 0;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 21b0122cb39c..1d5632d8bbcc 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -14,6 +14,15 @@
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs,
+ unsigned long flags)
+{
+ /*
+ * A dummy post handler is required to prohibit optimizing, because
+ * jump optimization does not support execution path overriding.
+ */
+}
+
struct fei_attr {
struct list_head list;
struct kprobe kp;
@@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
return NULL;
}
attr->kp.pre_handler = fei_kprobe_handler;
+ attr->kp.post_handler = fei_post_handler;
attr->retval = adjust_error_retval(addr, 0);
INIT_LIST_HEAD(&attr->list);
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e7214093dcd1..01ebdf1f9f40 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
#include <linux/jump_label_ratelimit.h>
#include <linux/bug.h>
#include <linux/cpu.h>
+#include <asm/sections.h>
#ifdef HAVE_JUMP_LABEL
@@ -421,15 +422,15 @@ void __init jump_label_init(void)
cpus_read_unlock();
}
-/* Disable any jump label entries in __init code */
-void __init jump_label_invalidate_init(void)
+/* Disable any jump label entries in __init/__exit code */
+void __init jump_label_invalidate_initmem(void)
{
struct jump_entry *iter_start = __start___jump_table;
struct jump_entry *iter_stop = __stop___jump_table;
struct jump_entry *iter;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (init_kernel_text(iter->code))
+ if (init_section_contains((void *)(unsigned long)iter->code, 1))
iter->code = 0;
}
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 89b5f83f1969..023386338269 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock)
return;
}
+ printk(KERN_CONT "%p", hlock->instance);
print_lock_name(lock_classes + class_idx - 1);
- printk(KERN_CONT ", at: [<%p>] %pS\n",
- (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
+ printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
static void lockdep_print_held_locks(struct task_struct *curr)
@@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
- printk("\nnew class %p: %s", class->key, class->name);
+ printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
@@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: [<%p>] %pS\n",
+ printk("%*s ... key at: [<%px>] %pS\n",
depth, "", class->key, class->key);
}
@@ -2340,7 +2340,7 @@ cache_hit:
if (very_verbose(class)) {
printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
+ "%016Lx tail class: [%px] %s\n",
(unsigned long long)chain_key,
class->key, class->name);
}
@@ -2349,7 +2349,7 @@ cache_hit:
}
if (very_verbose(class)) {
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
(unsigned long long)chain_key, class->key, class->name);
}
@@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
void print_irqtrace_events(struct task_struct *curr)
{
printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
+ printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
(void *)curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+ printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
(void *)curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): [<%p>] %pS\n",
+ printk("softirqs last enabled at (%u): [<%px>] %pS\n",
curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
(void *)curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+ printk("softirqs last disabled at (%u): [<%px>] %pS\n",
curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
(void *)curr->softirq_disable_ip);
}
@@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
* Sanity check, the lock-class key must be persistent:
*/
if (!static_obj(key)) {
- printk("BUG: key %p not in .data!\n", key);
+ printk("BUG: key %px not in .data!\n", key);
/*
* What it says above ^^^^^, I suggest you read it.
*/
@@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
atomic_inc((atomic_t *)&class->ops);
if (very_verbose(class)) {
- printk("\nacquire class [%p] %s", class->key, class->name);
+ printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
@@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
pr_warn("-------------------------\n");
- pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 858a07590e39..2048359f33d2 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1082,15 +1082,16 @@ static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock);
/**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
*
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
+ * Lock the mutex like mutex_lock(). If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
*
- * This function is similar to (but not equivalent to) down_interruptible().
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
@@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
int __sched mutex_lock_killable(struct mutex *lock)
{
might_sleep();
@@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
void __sched mutex_lock_io(struct mutex *lock)
{
int token;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 940633c63254..4f014be7a4b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(ret)) {
__set_current_state(TASK_RUNNING);
- if (rt_mutex_has_waiters(lock))
- remove_waiter(lock, &waiter);
+ remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 68686b3ec3c1..d1d62f942be2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
static inline struct rt_mutex_waiter *
rt_mutex_top_waiter(struct rt_mutex *lock)
{
- struct rt_mutex_waiter *w;
-
- w = rb_entry(lock->waiters.rb_leftmost,
- struct rt_mutex_waiter, tree_entry);
- BUG_ON(w->lock != lock);
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
+ if (leftmost) {
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ BUG_ON(w->lock != lock);
+ }
return w;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f549c552dbf1..30465a2f2b6c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock);
void up_read(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
__up_read(sem);
}
@@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read);
void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
rwsem_clear_owner(sem);
__up_write(sem);
@@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
rwsem_set_reader_owned(sem);
__downgrade_write(sem);
@@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
__up_read(sem);
}
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a883b8f1fdc6..a17cba8d94bb 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -16,6 +16,12 @@
*/
#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c)
+#endif
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
* do a write to the rwsem cacheline when it is really necessary
* to minimize cacheline contention.
*/
- if (sem->owner != RWSEM_READER_OWNED)
+ if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 4dd4274cabe2..895e6b76b25e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -427,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_pfn_remap:
err_radix:
pgmap_radix_release(res, pgoff);
- devres_free(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
diff --git a/kernel/module.c b/kernel/module.c
index ad2d420024f6..e42764acedb4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file)
m->private = kallsyms_show_value() ? NULL : (void *)8ul;
}
- return 0;
+ return err;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/panic.c b/kernel/panic.c
index 4b794f1d8561..9d833d913c84 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -289,7 +289,7 @@ void panic(const char *fmt, ...)
disabled_wait(caller);
}
#endif
- pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
+ pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 6334f2c1abd0..7a693e31184a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp)
WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
}
+/* Compute the end-of-grace-period value for the specified sequence number. */
+static inline unsigned long rcu_seq_endval(unsigned long *sp)
+{
+ return (*sp | RCU_SEQ_STATE_MASK) + 1;
+}
+
/* Adjust sequence number for end of update-side operation. */
static inline void rcu_seq_end(unsigned long *sp)
{
smp_mb(); /* Ensure update-side operation before counter increment. */
WARN_ON_ONCE(!rcu_seq_state(*sp));
- WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
+ WRITE_ONCE(*sp, rcu_seq_endval(sp));
}
/* Take a snapshot of the update side's sequence number. */
@@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* Iterate over all possible CPUs in a leaf RCU node.
*/
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
- cpu <= rnp->grphi; \
- cpu = cpumask_next((cpu), cpu_possible_mask))
+ for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Iterate over all CPUs in a leaf RCU node's specified mask.
+ */
+#define rcu_find_next_bit(rnp, cpu, mask) \
+ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
+#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
+ for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
/*
* Wrappers for the rcu_node::lock acquire and release.
@@ -337,7 +353,7 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
#define raw_spin_trylock_rcu_node(p) \
({ \
@@ -348,6 +364,9 @@ do { \
___locked; \
})
+#define raw_lockdep_assert_held_rcu_node(p) \
+ lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
+
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
#ifdef CONFIG_TINY_RCU
@@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
void rcupdate_announce_bootup_oddness(void);
+void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
-#ifdef CONFIG_TINY_RCU
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
-#else /* #ifdef CONFIG_TINY_RCU */
-void rcu_request_urgent_qs_task(struct task_struct *t);
-#endif /* #else #ifdef CONFIG_TINY_RCU */
-
enum rcutorture_type {
RCU_FLAVOR,
RCU_BH_FLAVOR,
@@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void);
void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_wq;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d1ebdf9868bb..777e7a6a0292 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+/*
+ * The intended use cases for the nreaders and nwriters module parameters
+ * are as follows:
+ *
+ * 1. Specify only the nr_cpus kernel boot parameter. This will
+ * set both nreaders and nwriters to the value specified by
+ * nr_cpus for a mixed reader/writer test.
+ *
+ * 2. Specify the nr_cpus kernel boot parameter, but set
+ * rcuperf.nreaders to zero. This will set nwriters to the
+ * value specified by nr_cpus for an update-only test.
+ *
+ * 3. Specify the nr_cpus kernel boot parameter, but set
+ * rcuperf.nwriters to zero. This will set nreaders to the
+ * value specified by nr_cpus for a read-only test.
+ *
+ * Various other use cases may of course be specified.
+ */
+
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, 0, "Number of RCU reader threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, !IS_ENABLED(MODULE),
"Shutdown at end of performance tests.");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 308e6fdbced8..680c96d8c00f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -909,34 +909,38 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite) {
+ if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s,\n",
+ " GP expediting controlled from boot/sysfs for %s.\n",
torture_type, cur_ops->name);
- pr_alert("%s" TORTURE_FLAG
- " Disabled dynamic grace-period expediting.\n",
- torture_type);
- }
/* Initialize synctype[] array. If none set, take default. */
if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
- else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
- pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
- if (gp_exp1 && cur_ops->exp_sync)
+ pr_info("%s: Testing conditional GPs.\n", __func__);
+ } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ pr_alert("%s: gp_cond without primitives.\n", __func__);
+ }
+ if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
- else if (gp_exp && !cur_ops->exp_sync)
- pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
- if (gp_normal1 && cur_ops->deferred_free)
+ pr_info("%s: Testing expedited GPs.\n", __func__);
+ } else if (gp_exp && !cur_ops->exp_sync) {
+ pr_alert("%s: gp_exp without primitives.\n", __func__);
+ }
+ if (gp_normal1 && cur_ops->deferred_free) {
synctype[nsynctypes++] = RTWS_DEF_FREE;
- else if (gp_normal && !cur_ops->deferred_free)
- pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
- if (gp_sync1 && cur_ops->sync)
+ pr_info("%s: Testing asynchronous GPs.\n", __func__);
+ } else if (gp_normal && !cur_ops->deferred_free) {
+ pr_alert("%s: gp_normal without primitives.\n", __func__);
+ }
+ if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
- else if (gp_sync && !cur_ops->sync)
- pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ pr_info("%s: Testing normal GPs.\n", __func__);
+ } else if (gp_sync && !cur_ops->sync) {
+ pr_alert("%s: gp_sync without primitives.\n", __func__);
+ }
if (WARN_ONCE(nsynctypes == 0,
"rcu_torture_writer: No update-side primitives.\n")) {
/*
@@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg)
rcu_unexpedite_gp();
if (++expediting > 3)
expediting = -expediting;
+ } else if (!can_expedite) { /* Disabled during boot, recheck. */
+ can_expedite = !rcu_gp_is_expedited() &&
+ !rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
@@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg)
while (can_expedite && expediting++ < 0)
rcu_unexpedite_gp();
WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " Dynamic grace-period expediting was disabled.\n",
+ torture_type);
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg)
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else if (gp_normal == gp_exp) {
- if (torture_random(&rand) & 0x80)
+ if (cur_ops->sync && torture_random(&rand) & 0x80)
cur_ops->sync();
- else
+ else if (cur_ops->exp_sync)
cur_ops->exp_sync();
- } else if (gp_normal) {
+ } else if (gp_normal && cur_ops->sync) {
cur_ops->sync();
- } else {
+ } else if (cur_ops->exp_sync) {
cur_ops->exp_sync();
}
stutter_wait("rcu_torture_fakewriter");
@@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void)
atomic_set(&barrier_cbs_count, 0);
atomic_set(&barrier_cbs_invoked, 0);
barrier_cbs_tasks =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
GFP_KERNEL);
barrier_cbs_wq =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
- GFP_KERNEL);
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
@@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
* next grace period. Unlikely, but can happen. If it
* does happen, the debug-objects subsystem won't have splatted.
*/
- pr_alert("rcutorture: duplicated callback was invoked.\n");
+ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void)
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+ pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
@@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void)
/* Wait for them all to get done so we can safely return. */
rcu_barrier();
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
@@ -1799,7 +1809,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
if (nfakewriters > 0) {
- fakewriter_tasks = kzalloc(nfakewriters *
+ fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -1814,7 +1824,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("out of memory");
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d5cea81378cc..fb560fca9ef4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(sp))) {
- pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
return; /* Caller forgot to stop doing call_srcu()? */
}
free_percpu(sp->sda);
@@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
struct srcu_data *sdp = this_cpu_ptr(sp->sda);
int state;
- lockdep_assert_held(&sp->lock);
+ lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
- &sdp->work, delay);
+ srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
}
/*
@@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
{
unsigned long cbdelay;
bool cbs;
+ bool last_lvl;
int cpu;
unsigned long flags;
unsigned long gpseq;
int idx;
- int idxnext;
unsigned long mask;
struct srcu_data *sdp;
struct srcu_node *snp;
@@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
/* Initiate callback invocation as needed. */
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- if (snp >= sp->level[rcu_num_lvls - 1])
+ last_lvl = snp >= sp->level[rcu_num_lvls - 1];
+ if (last_lvl)
cbs = snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
@@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
/* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check))
+ if (!(gpseq & counter_wrap_check) && last_lvl)
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
sdp = per_cpu_ptr(sp->sda, cpu);
spin_lock_irqsave_rcu_node(sdp, flags);
if (ULONG_CMP_GE(gpseq,
sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq,
+ sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
spin_unlock_irqrestore_rcu_node(sdp, flags);
}
}
@@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
spin_unlock_irq_rcu_node(sp);
- /* Throttle expedited grace periods: Should be rare! */
- srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
- ? 0 : SRCU_INTERVAL);
+ srcu_reschedule(sp, 0);
} else {
spin_unlock_irq_rcu_node(sp);
}
@@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
spin_unlock_irqrestore_rcu_node(snp, flags);
}
spin_lock_irqsave_rcu_node(sp, flags);
- if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
spin_unlock_irqrestore_rcu_node(sp, flags);
}
@@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
srcu_gp_start(sp);
- queue_delayed_work(system_power_efficient_wq, &sp->work,
- srcu_get_delay(sp));
+ queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp));
}
spin_unlock_irqrestore_rcu_node(sp, flags);
}
@@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
spin_unlock_irq_rcu_node(sp);
if (pushgp)
- queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &sp->work, delay);
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 491bdf39f276..2a734692a581 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
*/
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
WRITE_ONCE(rdp->gpwrap, true);
if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
@@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
+ pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
}
@@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void)
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
struct rcu_node *rnp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/*
* If RCU is idle, we just wait for the next grace period.
@@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/*
* Pick up grace-period number for new callbacks. If this
@@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
{
bool ret = false;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
bool need_gp;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
@@ -2296,7 +2297,7 @@ static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
@@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
- lockdep_assert_held(&rcu_get_root(rsp)->lock);
+ raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp));
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* Walk up the rcu_node hierarchy. */
for (;;) {
@@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
unsigned long mask;
struct rcu_node *rnp_p;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
@@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
smp_mb(); /* List handling before counting for rcu_barrier(). */
- rdp->n_cbs_invoked += count;
rcu_segcblist_insert_count(&rdp->cblist, &rcl);
/* Reinstate batch limit if we have worked down the excess. */
@@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp)
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
- if (ret) {
- rsp->n_force_qs_lh++;
+ if (ret)
return;
- }
rnp_old = rnp;
}
/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
@@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
@@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
{
struct rcu_node *rnp = rdp->mynode;
- rdp->n_rcu_pending++;
-
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rsp, rdp);
@@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rcu_scheduler_fully_active &&
- rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
- rdp->n_rp_core_needs_qs++;
- } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
- rdp->n_rp_report_qs++;
+ if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
return 1;
- }
/* Does this CPU have callbacks ready to invoke? */
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rdp->n_rp_cb_ready++;
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
- }
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rdp->n_rp_cpu_needs_gp++;
+ if (cpu_needs_another_gp(rsp, rdp))
return 1;
- }
/* Has another RCU grace period completed? */
- if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
- rdp->n_rp_gp_completed++;
+ if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */
return 1;
- }
/* Has a new RCU grace period started? */
if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
- rdp->n_rp_gp_started++;
+ unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- }
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp)) {
- rdp->n_rp_nocb_defer_wakeup++;
+ if (rcu_nocb_need_deferred_wakeup(rdp))
return 1;
- }
/* nothing to do */
- rdp->n_rp_need_nothing++;
return 0;
}
@@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
@@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
pr_cont("\n");
}
+struct workqueue_struct *rcu_gp_wq;
+
void __init rcu_init(void)
{
int cpu;
@@ -4219,6 +4195,10 @@ void __init rcu_init(void)
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
}
+
+ /* Create workqueue for expedited GPs and for Tree SRCU. */
+ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_gp_wq);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6488a3b0e729..f491ab4f2e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -146,12 +146,6 @@ struct rcu_node {
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
- unsigned long n_tasks_boosted;
- /* Total number of tasks boosted. */
- unsigned long n_exp_boosts;
- /* Number of tasks boosted for expedited GP. */
- unsigned long n_normal_boosts;
- /* Number of tasks boosted for normal GP. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -184,13 +178,6 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
-/* Index values for nxttail array in struct rcu_data. */
-#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
-#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
-#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
-#define RCU_NEXT_TAIL 3
-#define RCU_NEXT_SIZE 4
-
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -217,8 +204,6 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
- unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -234,18 +219,7 @@ struct rcu_data {
/* Grace period that needs help */
/* from cond_resched(). */
- /* 5) __rcu_pending() statistics. */
- unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
- unsigned long n_rp_core_needs_qs;
- unsigned long n_rp_report_qs;
- unsigned long n_rp_cb_ready;
- unsigned long n_rp_cpu_needs_gp;
- unsigned long n_rp_gp_completed;
- unsigned long n_rp_gp_started;
- unsigned long n_rp_nocb_defer_wakeup;
- unsigned long n_rp_need_nothing;
-
- /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
+ /* 5) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
@@ -256,7 +230,7 @@ struct rcu_data {
atomic_long_t exp_workdone3; /* # done by others #3. */
int exp_dynticks_snap; /* Double-check need for IPI. */
- /* 7) Callback offloading. */
+ /* 6) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head *nocb_head; /* CBs waiting for kthread. */
struct rcu_head **nocb_tail;
@@ -283,7 +257,7 @@ struct rcu_data {
/* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 8) RCU CPU stall data. */
+ /* 7) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -374,10 +348,6 @@ struct rcu_state {
/* kthreads, if configured. */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
- unsigned long n_force_qs_lh; /* ~Number of calls leaving */
- /* due to lock unavailable. */
- unsigned long n_force_qs_ngp; /* Number of calls leaving */
- /* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long gp_activity; /* Time of last GP kthread */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..f72eefab8543 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
}
/*
+ * Return then value that expedited-grace-period counter will have
+ * at the end of the current grace period.
+ */
+static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
+{
+ return rcu_seq_endval(&rsp->expedited_sequence);
+}
+
+/*
* Record the end of an expedited grace period.
*/
static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
@@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
int ret;
struct rcu_node *rnp;
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
sync_exp_reset_tree(rsp);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
- for_each_leaf_node_possible_cpu(rnp, cpu) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+ int snap;
- rdp->exp_dynticks_snap =
- rcu_dynticks_snap(rdp->dynticks);
if (raw_smp_processor_id() == cpu ||
- rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
- !(rnp->qsmaskinitnext & rdp->grpmask))
- mask_ofl_test |= rdp->grpmask;
+ !(rnp->qsmaskinitnext & mask)) {
+ mask_ofl_test |= mask;
+ } else {
+ snap = rcu_dynticks_snap(rdtp);
+ if (rcu_dynticks_in_eqs(snap))
+ mask_ofl_test |= mask;
+ else
+ rdp->exp_dynticks_snap = snap;
+ }
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
@@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_possible_cpu(rnp, cpu) {
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
@@ -417,6 +435,7 @@ retry_ipi:
(rnp->expmask & mask)) {
/* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
schedule_timeout_uninterruptible(1);
goto retry_ipi;
}
@@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret;
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies;
@@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rew.rew_rsp = rsp;
rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ queue_work(rcu_gp_wq, &rew.rew_work);
}
/* Wait for expedited grace period to complete. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fb88a028deec..84fbee4686d3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
WARN_ON_ONCE(rdp->mynode != rnp);
WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
@@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
}
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
sched_show_task(t);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp)
* expedited grace period must boost all blocked tasks, including
* those blocking the pre-existing normal grace period.
*/
- if (rnp->exp_tasks != NULL) {
+ if (rnp->exp_tasks != NULL)
tb = rnp->exp_tasks;
- rnp->n_exp_boosts++;
- } else {
+ else
tb = rnp->boost_tasks;
- rnp->n_normal_boosts++;
- }
- rnp->n_tasks_boosted++;
/*
* We boost task t by manufacturing an rt_mutex that appears to
@@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
struct task_struct *t;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
@@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
char *ticks_title;
unsigned long ticks_value;
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
if (rsp->gpnum == rdp->gpnum) {
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
@@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg)
smp_mb__before_atomic(); /* _add after CB invocation. */
atomic_long_add(-c, &rdp->nocb_q_count);
atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
- rdp->n_nocbs_invoked += c;
}
return 0;
}
@@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void)
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 332303be4beb..15b10e210a6b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -22,7 +22,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);
if (m) \
seq_printf(m, x); \
else \
- printk(x); \
+ pr_cont(x); \
} while (0)
/*
@@ -475,12 +475,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n"
- "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "runnable tasks:\n");
+ SEQ_printf(m, " S task PID tree-key switches prio"
+ " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, "-------------------------------------------------------"
+ "----------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -501,9 +501,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
#else
- SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@@ -571,9 +573,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
#else
- SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:\n", cpu);
#endif
#define P(x) \
@@ -600,7 +604,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
struct dl_bw *dl_bw;
- SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "dl_rq[%d]:\n", cpu);
#define PU(x) \
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f6b5f19223d6..78eabc41eaa6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -113,16 +113,6 @@ config NO_HZ_FULL
endchoice
-config NO_HZ_FULL_ALL
- bool "Full dynticks system on all CPUs by default (except CPU 0)"
- depends on NO_HZ_FULL
- help
- If the user doesn't pass the nohz_full boot option to
- define the range of full dynticks CPUs, consider that all
- CPUs in the system are full dynticks by default.
- Note the boot CPU will still be kept outside the range to
- handle the timekeeping duty.
-
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 75043046914e..10b7186d0638 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -50,6 +50,7 @@
#include <linux/export.h>
#include <linux/hashtable.h>
#include <linux/compat.h>
+#include <linux/nospec.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = {
static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
- if (id < 0)
+ clockid_t idx = id;
+
+ if (id < 0) {
return (id & CLOCKFD_MASK) == CLOCKFD ?
&clock_posix_dynamic : &clock_posix_cpu;
+ }
- if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
+ if (id >= ARRAY_SIZE(posix_clocks))
return NULL;
- return posix_clocks[id];
+
+ return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ab92aa4442df..5d4a0342f934 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu)
return 0;
}
-static int tick_nohz_init_all(void)
-{
- int err = -1;
-
-#ifdef CONFIG_NO_HZ_FULL_ALL
- if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
- return err;
- }
- err = 0;
- cpumask_setall(tick_nohz_full_mask);
- tick_nohz_full_running = true;
-#endif
- return err;
-}
-
void __init tick_nohz_init(void)
{
int cpu, ret;
- if (!tick_nohz_full_running) {
- if (tick_nohz_init_all() < 0)
- return;
- }
+ if (!tick_nohz_full_running)
+ return;
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c0a9e310d715..01e6b3a38871 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto_tp;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto_tp;
+ default:
+ return tracing_func_proto(func_id);
+ }
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
+ return true;
+}
+
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = tp_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
+BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
struct bpf_perf_event_value *, buf, u32, size)
{
int err = -EINVAL;
@@ -678,8 +712,8 @@ clear:
return err;
}
-static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
- .func = bpf_perf_prog_read_value_tp,
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
+ .func = bpf_perf_prog_read_value,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
@@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
case BPF_FUNC_perf_prog_read_value:
- return &bpf_perf_prog_read_value_proto_tp;
+ return &bpf_perf_prog_read_value_proto;
default:
return tracing_func_proto(func_id);
}
}
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- struct bpf_insn_access_aux *info)
-{
- if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
-
- BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
- return true;
-}
-
-const struct bpf_verifier_ops tracepoint_verifier_ops = {
- .get_func_proto = tp_prog_func_proto,
- .is_valid_access = tp_prog_is_valid_access,
-};
-
-const struct bpf_prog_ops tracepoint_prog_ops = {
-};
-
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
@@ -779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
}
const struct bpf_verifier_ops perf_event_verifier_ops = {
- .get_func_proto = tp_prog_func_proto,
+ .get_func_proto = pe_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 55d6dff37daf..2c416509b834 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
+#include "trace_probe.h"
static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
@@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_unlock(&event_mutex);
}
+#ifdef CONFIG_KPROBE_EVENTS
+int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+ int ret;
+ char *func = NULL;
+ struct trace_event_call *tp_event;
+
+ if (p_event->attr.kprobe_func) {
+ func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+ ret = strncpy_from_user(
+ func, u64_to_user_ptr(p_event->attr.kprobe_func),
+ KSYM_NAME_LEN);
+ if (ret < 0)
+ goto out;
+
+ if (func[0] == '\0') {
+ kfree(func);
+ func = NULL;
+ }
+ }
+
+ tp_event = create_local_trace_kprobe(
+ func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
+ p_event->attr.probe_offset, is_retprobe);
+ if (IS_ERR(tp_event)) {
+ ret = PTR_ERR(tp_event);
+ goto out;
+ }
+
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ destroy_local_trace_kprobe(tp_event);
+out:
+ kfree(func);
+ return ret;
+}
+
+void perf_kprobe_destroy(struct perf_event *p_event)
+{
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+
+ destroy_local_trace_kprobe(p_event->tp_event);
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
+#ifdef CONFIG_UPROBE_EVENTS
+int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+ int ret;
+ char *path = NULL;
+ struct trace_event_call *tp_event;
+
+ if (!p_event->attr.uprobe_path)
+ return -EINVAL;
+ path = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+ ret = strncpy_from_user(
+ path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
+ if (ret < 0)
+ goto out;
+ if (path[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ tp_event = create_local_trace_uprobe(
+ path, p_event->attr.probe_offset, is_retprobe);
+ if (IS_ERR(tp_event)) {
+ ret = PTR_ERR(tp_event);
+ goto out;
+ }
+
+ /*
+ * local trace_uprobe need to hold event_mutex to call
+ * uprobe_buffer_enable() and uprobe_buffer_disable().
+ * event_mutex is not required for local trace_kprobes.
+ */
+ mutex_lock(&event_mutex);
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ destroy_local_trace_uprobe(tp_event);
+ mutex_unlock(&event_mutex);
+out:
+ kfree(path);
+ return ret;
+}
+
+void perf_uprobe_destroy(struct perf_event *p_event)
+{
+ mutex_lock(&event_mutex);
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+ mutex_unlock(&event_mutex);
+ destroy_local_trace_uprobe(p_event->tp_event);
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1fad24acd444..1cd3fb4d70f8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
disable_kprobe(&tk->rp.kp);
wait = 1;
}
+
+ /*
+ * if tk is not added to any list, it must be a local trace_kprobe
+ * created with perf_event_open. We don't need to wait for these
+ * trace_kprobes
+ */
+ if (list_empty(&tk->list))
+ wait = 0;
out:
if (wait) {
/*
@@ -659,7 +667,7 @@ static int create_trace_kprobe(int argc, char **argv)
char *symbol = NULL, *event = NULL, *group = NULL;
int maxactive = 0;
char *arg;
- unsigned long offset = 0;
+ long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
@@ -747,7 +755,7 @@ static int create_trace_kprobe(int argc, char **argv)
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
- if (ret) {
+ if (ret || offset < 0 || offset > UINT_MAX) {
pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
@@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
-static int register_kprobe_event(struct trace_kprobe *tk)
+static inline void init_trace_event_call(struct trace_kprobe *tk,
+ struct trace_event_call *call)
{
- struct trace_event_call *call = &tk->tp.call;
- int ret;
-
- /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
@@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk)
call->event.funcs = &kprobe_funcs;
call->class->define_fields = kprobe_event_define_fields;
}
+
+ call->flags = TRACE_EVENT_FL_KPROBE;
+ call->class->reg = kprobe_register;
+ call->data = tk;
+}
+
+static int register_kprobe_event(struct trace_kprobe *tk)
+{
+ struct trace_event_call *call = &tk->tp.call;
+ int ret = 0;
+
+ init_trace_event_call(tk, call);
+
if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
ret = register_trace_event(&call->event);
@@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = TRACE_EVENT_FL_KPROBE;
- call->class->reg = kprobe_register;
- call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n",
@@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
+#ifdef CONFIG_PERF_EVENTS
+/* create a trace_kprobe, but don't add it to global lists */
+struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+ bool is_return)
+{
+ struct trace_kprobe *tk;
+ int ret;
+ char *event;
+
+ /*
+ * local trace_kprobes are not added to probe_list, so they are never
+ * searched in find_trace_kprobe(). Therefore, there is no concern of
+ * duplicated name here.
+ */
+ event = func ? func : "DUMMY_EVENT";
+
+ tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func,
+ offs, 0 /* maxactive */, 0 /* nargs */,
+ is_return);
+
+ if (IS_ERR(tk)) {
+ pr_info("Failed to allocate trace_probe.(%d)\n",
+ (int)PTR_ERR(tk));
+ return ERR_CAST(tk);
+ }
+
+ init_trace_event_call(tk, &tk->tp.call);
+
+ if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = __register_trace_kprobe(tk);
+ if (ret < 0)
+ goto error;
+
+ return &tk->tp.call;
+error:
+ free_trace_kprobe(tk);
+ return ERR_PTR(ret);
+}
+
+void destroy_local_trace_kprobe(struct trace_event_call *event_call)
+{
+ struct trace_kprobe *tk;
+
+ tk = container_of(event_call, struct trace_kprobe, tp.call);
+
+ if (trace_probe_is_enabled(&tk->tp)) {
+ WARN_ON(1);
+ return;
+ }
+
+ __unregister_trace_kprobe(tk);
+ free_trace_kprobe(tk);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d59357308677..daf54bda4dc8 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
}
/* Split symbol and offset. */
-int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+int traceprobe_split_symbol_offset(char *symbol, long *offset)
{
char *tmp;
int ret;
@@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
if (!offset)
return -EINVAL;
- tmp = strchr(symbol, '+');
+ tmp = strpbrk(symbol, "+-");
if (tmp) {
- /* skip sign because kstrtoul doesn't accept '+' */
- ret = kstrtoul(tmp + 1, 0, offset);
+ ret = kstrtol(tmp, 0, offset);
if (ret)
return ret;
-
*tmp = '\0';
} else
*offset = 0;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e101c5bb9eda..75daff22ccea 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name,
extern void traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
-extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
/* Sum up total data length for dynamic arraies (strings) */
static nokprobe_inline int
@@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
}
extern int set_print_fmt(struct trace_probe *tp, bool is_return);
+
+#ifdef CONFIG_PERF_EVENTS
+extern struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+ bool is_return);
+extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
+
+extern struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
+extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
+#endif
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 268029ae1be6..2014f4351ae0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
-static int register_uprobe_event(struct trace_uprobe *tu)
+static inline void init_trace_event_call(struct trace_uprobe *tu,
+ struct trace_event_call *call)
{
- struct trace_event_call *call = &tu->tp.call;
- int ret;
-
- /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
+ call->flags = TRACE_EVENT_FL_UPROBE;
+ call->class->reg = trace_uprobe_register;
+ call->data = tu;
+}
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+ struct trace_event_call *call = &tu->tp.call;
+ int ret = 0;
+
+ init_trace_event_call(tu, call);
+
if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
return -ENOMEM;
@@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return -ENODEV;
}
- call->flags = TRACE_EVENT_FL_UPROBE;
- call->class->reg = trace_uprobe_register;
- call->data = tu;
ret = trace_add_event_call(call);
if (ret) {
@@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
+struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
+{
+ struct trace_uprobe *tu;
+ struct inode *inode;
+ struct path path;
+ int ret;
+
+ ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = igrab(d_inode(path.dentry));
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ iput(inode);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * local trace_kprobes are not added to probe_list, so they are never
+ * searched in find_trace_kprobe(). Therefore, there is no concern of
+ * duplicated name "DUMMY_EVENT" here.
+ */
+ tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0,
+ is_return);
+
+ if (IS_ERR(tu)) {
+ pr_info("Failed to allocate trace_uprobe.(%d)\n",
+ (int)PTR_ERR(tu));
+ return ERR_CAST(tu);
+ }
+
+ tu->offset = offs;
+ tu->inode = inode;
+ tu->filename = kstrdup(name, GFP_KERNEL);
+ init_trace_event_call(tu, &tu->tp.call);
+
+ if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ return &tu->tp.call;
+error:
+ free_trace_uprobe(tu);
+ return ERR_PTR(ret);
+}
+
+void destroy_local_trace_uprobe(struct trace_event_call *event_call)
+{
+ struct trace_uprobe *tu;
+
+ tu = container_of(event_call, struct trace_uprobe, tp.call);
+
+ kfree(tu->tp.call.print_fmt);
+ tu->tp.call.print_fmt = NULL;
+
+ free_trace_uprobe(tu);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
/* Make a trace interface for controling probe points */
static __init int init_uprobe_trace(void)
{