diff options
Diffstat (limited to 'kernel')
32 files changed, 1327 insertions, 492 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..43f95d190eea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) + if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; err = check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 4b838470fac4..fc1c330c6bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; - list_for_each_entry(sibling, &leader->sibling_list, group_entry) + for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } @@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + + if (cgrp) { + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + __update_cgrp_time(cgrp); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller @@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, return; cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; + } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); @@ -937,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event, if (!is_cgroup_event(event)) return; - if (add && ctx->nr_cgroups++) - return; - else if (!add && --ctx->nr_cgroups) - return; /* * Because cgroup events are always per-cpu events, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ - if (add) { + + /* + * Since setting cpuctx->cgrp is conditional on the current @cgrp + * matching the event's cgroup, we must do this for every new event, + * because if the first would mismatch, the second would not try again + * and we would leave cpuctx->cgrp unset. + */ + if (add && !cpuctx->cgrp) { struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) cpuctx->cgrp = cgrp; - } else { - list_del(cpuctx_entry); - cpuctx->cgrp = NULL; } + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + + /* no cgroup running */ + if (!add) + cpuctx->cgrp = NULL; + + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + if (add) + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + else + list_del(cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1041,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event, static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - int rotations = 0; + bool rotations; lockdep_assert_irqs_disabled(); @@ -1460,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize event group nodes. + */ +static void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits. + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1470,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) } /* + * Helper function to initializes perf_event_group trees. + */ +static void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +/* + * Compare function for event groups; + * + * Implements complex key that first sorts by CPU and then by virtual index + * which provides ordering when rotating groups for the same CPU. + */ +static bool +perf_event_groups_less(struct perf_event *left, struct perf_event *right) +{ + if (left->cpu < right->cpu) + return true; + if (left->cpu > right->cpu) + return false; + + if (left->group_index < right->group_index) + return true; + if (left->group_index > right->group_index) + return false; + + return false; +} + +/* + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for + * key (see perf_event_groups_less). This places it last inside the CPU + * subtree. + */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + struct perf_event *node_event; + struct rb_node *parent; + struct rb_node **node; + + event->group_index = ++groups->index; + + node = &groups->tree.rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, struct perf_event, group_node); + + if (perf_event_groups_less(event, node_event)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, &groups->tree); +} + +/* + * Helper function to insert event into the pinned or flexible groups. + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. + */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || + RB_EMPTY_ROOT(&groups->tree)); + + rb_erase(&event->group_node, &groups->tree); + init_event_group(event); +} + +/* + * Helper function to delete event from its groups. + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get the leftmost event in the @cpu subtree. + */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu) +{ + struct perf_event *node_event = NULL, *match = NULL; + struct rb_node *node = groups->tree.rb_node; + + while (node) { + node_event = container_of(node, struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + match = node_event; + node = node->rb_left; + } + } + + return match; +} + +/* + * Like rb_entry_next_safe() for the @cpu subtree. + */ +static struct perf_event * +perf_event_groups_next(struct perf_event *event) +{ + struct perf_event *next; + + next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); + if (next && next->cpu == event->cpu) + return next; + + return NULL; +} + +/* + * Iterate through the whole groups tree. + */ +#define perf_event_groups_for_each(event, groups) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), group_node); event; \ + event = rb_entry_safe(rb_next(&event->group_node), \ + typeof(*event), group_node)) + +/* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1489,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1652,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event) group_leader->group_caps &= event->event_caps; - list_add_tail(&event->group_entry, &group_leader->sibling_list); + list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } @@ -1688,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); /* * If event was in error state, then keep it @@ -1706,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; + struct perf_event_context *ctx = event->ctx; - lockdep_assert_held(&event->ctx->lock); + lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. @@ -1722,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event) * If this is a sibling, remove it from its group. */ if (event->group_leader != event) { - list_del_init(&event->group_entry); + list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + sibling->group_leader = sibling; + list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!RB_EMPTY_NODE(&event->group_node)) { + add_event_to_groups(sibling, event->ctx); + + if (sibling->state == PERF_EVENT_STATE_ACTIVE) { + struct list_head *list = sibling->attr.pinned ? + &ctx->pinned_active : &ctx->flexible_active; + + list_add_tail(&sibling->active_list, list); + } + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: perf_event__header_size(event->group_leader); - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + for_each_sibling_event(tmp, event->group_leader) perf_event__header_size(tmp); } @@ -1772,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event) */ static inline int pmu_filter_match(struct perf_event *event) { - struct perf_event *child; + struct perf_event *sibling; if (!__pmu_filter_match(event)) return 0; - list_for_each_entry(child, &event->sibling_list, group_entry) { - if (!__pmu_filter_match(child)) + for_each_sibling_event(sibling, event) { + if (!__pmu_filter_match(sibling)) return 0; } @@ -1805,6 +1995,13 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + /* + * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but + * we can schedule events _OUT_ individually through things like + * __perf_remove_from_context(). + */ + list_del_init(&event->active_list); + perf_pmu_disable(event->pmu); event->pmu->del(event, 0); @@ -1845,7 +2042,7 @@ group_sched_out(struct perf_event *group_event, /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) + for_each_sibling_event(event, group_event) event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); @@ -2124,7 +2321,7 @@ group_sched_in(struct perf_event *group_event, /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; @@ -2140,7 +2337,7 @@ group_error: * partial group before returning: * The events up to the failed event are scheduled out normally. */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + for_each_sibling_event(event, group_event) { if (event == partial_group) break; @@ -2317,6 +2514,18 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&task_ctx->lock); } +#ifdef CONFIG_CGROUP_PERF + if (is_cgroup_event(event)) { + /* + * If the current cgroup doesn't match the event's + * cgroup, we should not try to schedule it. + */ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + reprogram = cgroup_is_descendant(cgrp->css.cgroup, + event->cgrp->css.cgroup); + } +#endif + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); @@ -2650,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh) } EXPORT_SYMBOL_GPL(perf_event_refresh); +static int perf_event_modify_breakpoint(struct perf_event *bp, + struct perf_event_attr *attr) +{ + int err; + + _perf_event_disable(bp); + + err = modify_user_hw_breakpoint_check(bp, attr, true); + if (err) { + if (!bp->attr.disabled) + _perf_event_enable(bp); + + return err; + } + + if (!attr->disabled) + _perf_event_enable(bp); + return 0; +} + +static int perf_event_modify_attr(struct perf_event *event, + struct perf_event_attr *attr) +{ + if (event->attr.type != attr->type) + return -EINVAL; + + switch (event->attr.type) { + case PERF_TYPE_BREAKPOINT: + return perf_event_modify_breakpoint(event, attr); + default: + /* Place holder for future additions. */ + return -EOPNOTSUPP; + } +} + static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { + struct perf_event *event, *tmp; int is_active = ctx->is_active; - struct perf_event *event; lockdep_assert_held(&ctx->lock); @@ -2702,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) group_sched_out(event, cpuctx, ctx); } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) group_sched_out(event, cpuctx, ctx); } perf_pmu_enable(ctx->pmu); @@ -2994,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int visit_groups_merge(struct perf_event_groups *groups, int cpu, + int (*func)(struct perf_event *, void *), void *data) { - struct perf_event *event; + struct perf_event **evt, *evt1, *evt2; + int ret; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + evt1 = perf_event_groups_first(groups, -1); + evt2 = perf_event_groups_first(groups, cpu); + + while (evt1 || evt2) { + if (evt1 && evt2) { + if (evt1->group_index < evt2->group_index) + evt = &evt1; + else + evt = &evt2; + } else if (evt1) { + evt = &evt1; + } else { + evt = &evt2; + } - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); + ret = func(*evt, data); + if (ret) + return ret; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + *evt = perf_event_groups_next(*evt); } + + return 0; +} + +struct sched_in_data { + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + int can_add_hw; +}; + +static int pinned_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->pinned_active); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + return 0; +} + +static int flexible_sched_in(struct perf_event *event, void *data) +{ + struct sched_in_data *sid = data; + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { + if (!group_sched_in(event, sid->cpuctx, sid->ctx)) + list_add_tail(&event->active_list, &sid->ctx->flexible_active); + else + sid->can_add_hw = 0; + } + + return 0; +} + +static void +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; + + visit_groups_merge(&ctx->pinned_groups, + smp_processor_id(), + pinned_sched_in, &sid); } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + struct sched_in_data sid = { + .ctx = ctx, + .cpuctx = cpuctx, + .can_add_hw = 1, + }; - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } + visit_groups_merge(&ctx->flexible_groups, + smp_processor_id(), + flexible_sched_in, &sid); } static void @@ -3121,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); @@ -3350,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } /* - * Round-robin a context's events: + * Move @event to the tail of the @ctx's elegible events. */ -static void rotate_ctx(struct perf_event_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (ctx->rotate_disable) + return; + + perf_event_groups_delete(&ctx->flexible_groups, event); + perf_event_groups_insert(&ctx->flexible_groups, event); +} + +static inline struct perf_event * +ctx_first_active(struct perf_event_context *ctx) +{ + return list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { + struct perf_event *cpu_event = NULL, *task_event = NULL; + bool cpu_rotate = false, task_rotate = false; struct perf_event_context *ctx = NULL; - int rotate = 0; + + /* + * Since we run this from IRQ context, nobody can install new + * events, thus the event count values are stable. + */ if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + cpu_rotate = true; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { if (ctx->nr_events != ctx->nr_active) - rotate = 1; + task_rotate = true; } - if (!rotate) - goto done; + if (!(cpu_rotate || task_rotate)) + return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) + if (task_rotate) + task_event = ctx_first_active(ctx); + if (cpu_rotate) + cpu_event = ctx_first_active(&cpuctx->ctx); + + /* + * As per the order given at ctx_resched() first 'pop' task flexible + * and then, if needed CPU flexible. + */ + if (task_event || (ctx && cpu_event)) ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (cpu_event) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + if (task_event) + rotate_ctx(ctx, task_event); + if (cpu_event) + rotate_ctx(&cpuctx->ctx, cpu_event); perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - return rotate; + return true; } void perf_event_task_tick(void) @@ -3543,7 +3876,7 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, group_entry) { + for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3717,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); + INIT_LIST_HEAD(&ctx->pinned_active); + INIT_LIST_HEAD(&ctx->flexible_active); atomic_set(&ctx->refcount, 1); } @@ -4389,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -4583,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, group_entry) + for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } @@ -4665,6 +5000,8 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -4737,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); + + case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { + struct perf_event_attr new_attr; + int err = perf_copy_attr((struct perf_event_attr __user *)arg, + &new_attr); + + if (err) + return err; + + return perf_event_modify_attr(event, &new_attr); + } default: return -ENOTTY; } @@ -5732,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); @@ -5741,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && @@ -7998,9 +8347,119 @@ static struct pmu perf_tracepoint = { .read = perf_swevent_read, }; +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match following PMU_FORMAT_ATTR(). + * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ +}; + +PMU_FORMAT_ATTR(retprobe, "config:0"); + +static struct attribute *probe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group probe_format_group = { + .name = "format", + .attrs = probe_attrs, +}; + +static const struct attribute_group *probe_attr_groups[] = { + &probe_format_group, + NULL, +}; +#endif + +#ifdef CONFIG_KPROBE_EVENTS +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_kprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_kprobe_destroy; + + return 0; +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_uprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_uprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_uprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_uprobe_destroy; + + return 0; +} +#endif /* CONFIG_UPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8077,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event) } #endif +/* + * returns true if the event is a tracepoint, or a kprobe/upprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) +{ + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; +#endif + return false; +} + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; @@ -8129,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static void perf_event_free_bpf_prog(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) { + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } @@ -8325,7 +8803,8 @@ restart: * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * - * if <size> is not specified, the range is treated as a single address. + * if <size> is not specified or is zero, the range is treated as a single + * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, @@ -8375,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { + static const enum perf_addr_filter_action_t actions[] = { + [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, + [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, + [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, + }; ret = -EINVAL; if (!*start) @@ -8391,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, switch (token) { case IF_ACT_FILTER: case IF_ACT_START: - filter->filter = 1; - case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; + filter->action = actions[token]; state = IF_STATE_SOURCE; break; @@ -8409,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_SOURCE) goto fail; - if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) - filter->range = 1; - *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; - if (filter->range) { + if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) @@ -8425,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { - int fpos = filter->range ? 2 : 1; + int fpos = token == IF_SRC_FILE ? 2 : 1; filename = match_strdup(&args[fpos]); if (!filename) { @@ -8451,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (kernel && event->attr.exclude_kernel) goto fail; + /* + * ACTION "filter" must have a non-zero length region + * specified. + */ + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && + !filter->size) + goto fail; + if (!kernel) { if (!filename) goto fail; @@ -8548,47 +9036,36 @@ fail_clear_files: return ret; } -static int -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) -{ - struct perf_event_context *ctx = event->ctx; - int ret; - - /* - * Beware, here be dragons!! - * - * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint - * stuff does not actually need it. So temporarily drop ctx->mutex. As per - * perf_event_ctx_lock() we already have a reference on ctx. - * - * This can result in event getting moved to a different ctx, but that - * does not affect the tracepoint state. - */ - mutex_unlock(&ctx->mutex); - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - mutex_lock(&ctx->mutex); - - return ret; -} - static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == PERF_TYPE_TRACEPOINT) - ret = perf_tracepoint_set_filter(event, filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); @@ -9441,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->active_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -9718,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (!attr->sample_max_stack) + attr->sample_max_stack = sysctl_perf_event_max_stack; + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); out: @@ -9931,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; - if (!attr.sample_max_stack) - attr.sample_max_stack = sysctl_perf_event_max_stack; - /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -10207,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(group_leader, 0); put_ctx(gctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -10229,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open, * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); @@ -10869,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) @@ -10968,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) @@ -10984,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..6e28d2866be5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -44,6 +44,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/bug.h> #include <linux/hw_breakpoint.h> /* @@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) return 1; } -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +static inline enum bp_type_idx find_slot_idx(u64 bp_type) { - if (bp->attr.bp_type & HW_BREAKPOINT_RW) + if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; @@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.target == tsk && - find_slot_idx(iter) == type && + find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } @@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ -static int __reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; @@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp) return -ENOMEM; /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) + if (bp_type == HW_BREAKPOINT_EMPTY || + bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); @@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - ret = __reserve_bp_slot(bp); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); return ret; } -static void __release_bp_slot(struct perf_event *bp) +static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; - type = find_slot_idx(bp); + type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); toggle_bp_slot(bp, false, type, weight); } @@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); mutex_unlock(&nr_bp_mutex); } +static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int err; + + __release_bp_slot(bp, old_type); + + err = __reserve_bp_slot(bp, bp->attr.bp_type); + if (err) { + /* + * Reserve the old_type slot back in case + * there's no space for the new type. + * + * This must succeed, because we just released + * the old_type slot in the __release_bp_slot + * call above. If not, something is broken. + */ + WARN_ON(__reserve_bp_slot(bp, old_type)); + } + + return err; +} + +static int modify_bp_slot(struct perf_event *bp, u64 old_type) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + ret = __modify_bp_slot(bp, old_type); + mutex_unlock(&nr_bp_mutex); + return ret; +} + /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and @@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - return __reserve_bp_slot(bp); + return __reserve_bp_slot(bp, bp->attr.bp_type); } int dbg_release_bp_slot(struct perf_event *bp) @@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp) if (mutex_is_locked(&nr_bp_mutex)) return -1; - __release_bp_slot(bp); + __release_bp_slot(bp, bp->attr.bp_type); return 0; } @@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + bool modify = attr->bp_type != old_type; + int err = 0; + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (check && memcmp(&bp->attr, attr, sizeof(*attr))) + return -EINVAL; + + err = validate_hw_breakpoint(bp); + if (!err && modify) + err = modify_bp_slot(bp, old_type); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + return err; + } + + bp->attr.disabled = attr->disabled; + return 0; +} + /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; + if (!attr->disabled) { + int err = modify_user_hw_breakpoint_check(bp, attr, false); - err = validate_hw_breakpoint(bp); - if (!err) + if (err) + return err; perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; + bp->attr.disabled = 0; } - -end: - bp->attr.disabled = attr->disabled; - return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 21b0122cb39c..1d5632d8bbcc 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -14,6 +14,15 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); +static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs, + unsigned long flags) +{ + /* + * A dummy post handler is required to prohibit optimizing, because + * jump optimization does not support execution path overriding. + */ +} + struct fei_attr { struct list_head list; struct kprobe kp; @@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) return NULL; } attr->kp.pre_handler = fei_kprobe_handler; + attr->kp.post_handler = fei_post_handler; attr->retval = adjust_error_retval(addr, 0); INIT_LIST_HEAD(&attr->list); } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e7214093dcd1..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> +#include <asm/sections.h> #ifdef HAVE_JUMP_LABEL @@ -421,15 +422,15 @@ void __init jump_label_init(void) cpus_read_unlock(); } -/* Disable any jump label entries in __init code */ -void __init jump_label_invalidate_init(void) +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; for (iter = iter_start; iter < iter_stop; iter++) { - if (init_kernel_text(iter->code)) + if (init_section_contains((void *)(unsigned long)iter->code, 1)) iter->code = 0; } } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83f1969..023386338269 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) return; } + printk(KERN_CONT "%p", hlock->instance); print_lock_name(lock_classes + class_idx - 1); - printk(KERN_CONT ", at: [<%p>] %pS\n", - (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); + printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - printk("\nnew class %p: %s", class->key, class->name); + printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) } printk("%*s }\n", depth, ""); - printk("%*s ... key at: [<%p>] %pS\n", + printk("%*s ... key at: [<%px>] %pS\n", depth, "", class->key, class->key); } @@ -2340,7 +2340,7 @@ cache_hit: if (very_verbose(class)) { printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", + "%016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2349,7 +2349,7 @@ cache_hit: } if (very_verbose(class)) { - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", (unsigned long long)chain_key, class->key, class->name); } @@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + printk("hardirqs last enabled at (%u): [<%px>] %pS\n", curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, (void *)curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + printk("hardirqs last disabled at (%u): [<%px>] %pS\n", curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, (void *)curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): [<%p>] %pS\n", + printk("softirqs last enabled at (%u): [<%px>] %pS\n", curr->softirq_enable_event, (void *)curr->softirq_enable_ip, (void *)curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): [<%p>] %pS\n", + printk("softirqs last disabled at (%u): [<%px>] %pS\n", curr->softirq_disable_event, (void *)curr->softirq_disable_ip, (void *)curr->softirq_disable_ip); } @@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, * Sanity check, the lock-class key must be persistent: */ if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); + printk("BUG: key %px not in .data!\n", key); /* * What it says above ^^^^^, I suggest you read it. */ @@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); + printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); @@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2048359f33d2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1082,15 +1082,16 @@ static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock); /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired. * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock(). If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex. * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived. */ int __sched mutex_lock_interruptible(struct mutex *lock) { @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */ int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. + */ void __sched mutex_lock_io(struct mutex *lock) { int token; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 940633c63254..4f014be7a4b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - if (rt_mutex_has_waiters(lock)) - remove_waiter(lock, &waiter); + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3ec3c1..d1d62f942be2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { - struct rt_mutex_waiter *w; - - w = rb_entry(lock->waiters.rb_leftmost, - struct rt_mutex_waiter, tree_entry); - BUG_ON(w->lock != lock); + struct rb_node *leftmost = rb_first_cached(&lock->waiters); + struct rt_mutex_waiter *w = NULL; + if (leftmost) { + w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + BUG_ON(w->lock != lock); + } return w; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c552dbf1..30465a2f2b6c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } @@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); __up_write(sem); @@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_set_reader_owned(sem); __downgrade_write(sem); @@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f1fdc6..a17cba8d94bb 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,6 +16,12 @@ */ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +#else +# define DEBUG_RWSEMS_WARN_ON(c) +#endif + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * do a write to the rwsem cacheline when it is really necessary * to minimize cacheline contention. */ - if (sem->owner != RWSEM_READER_OWNED) + if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } diff --git a/kernel/memremap.c b/kernel/memremap.c index 4dd4274cabe2..895e6b76b25e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -427,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: err_radix: pgmap_radix_release(res, pgoff); - devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..e42764acedb4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file) m->private = kallsyms_show_value() ? NULL : (void *)8ul; } - return 0; + return err; } static const struct file_operations proc_modules_operations = { diff --git a/kernel/panic.c b/kernel/panic.c index 4b794f1d8561..9d833d913c84 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -289,7 +289,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif - pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); + pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) WARN_ON_ONCE(rcu_seq_state(*sp) != 1); } +/* Compute the end-of-grace-period value for the specified sequence number. */ +static inline unsigned long rcu_seq_endval(unsigned long *sp) +{ + return (*sp | RCU_SEQ_STATE_MASK) + 1; +} + /* Adjust sequence number for end of update-side operation. */ static inline void rcu_seq_end(unsigned long *sp) { smp_mb(); /* Ensure update-side operation before counter increment. */ WARN_ON_ONCE(!rcu_seq_state(*sp)); - WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); + WRITE_ONCE(*sp, rcu_seq_endval(sp)); } /* Take a snapshot of the update side's sequence number. */ @@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) + for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + (cpu) <= rnp->grphi; \ + (cpu) = cpumask_next((cpu), cpu_possible_mask)) + +/* + * Iterate over all CPUs in a leaf RCU node's specified mask. + */ +#define rcu_find_next_bit(rnp, cpu, mask) \ + ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) +#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ + for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + (cpu) <= rnp->grphi; \ + (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) /* * Wrappers for the rcu_node::lock acquire and release. @@ -337,7 +353,7 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -348,6 +364,9 @@ do { \ ___locked; \ }) +#define raw_lockdep_assert_held_rcu_node(p) \ + lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU @@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 -#ifdef CONFIG_TINY_RCU -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } -#else /* #ifdef CONFIG_TINY_RCU */ -void rcu_request_urgent_qs_task(struct task_struct *t); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, @@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuperf.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + */ + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, 0, "Number of RCU reader threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) { + if (!can_expedite) pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Disabled dynamic grace-period expediting.\n", - torture_type); - } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; - else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) - pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); - if (gp_exp1 && cur_ops->exp_sync) + pr_info("%s: Testing conditional GPs.\n", __func__); + } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + pr_alert("%s: gp_cond without primitives.\n", __func__); + } + if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; - else if (gp_exp && !cur_ops->exp_sync) - pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); - if (gp_normal1 && cur_ops->deferred_free) + pr_info("%s: Testing expedited GPs.\n", __func__); + } else if (gp_exp && !cur_ops->exp_sync) { + pr_alert("%s: gp_exp without primitives.\n", __func__); + } + if (gp_normal1 && cur_ops->deferred_free) { synctype[nsynctypes++] = RTWS_DEF_FREE; - else if (gp_normal && !cur_ops->deferred_free) - pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); - if (gp_sync1 && cur_ops->sync) + pr_info("%s: Testing asynchronous GPs.\n", __func__); + } else if (gp_normal && !cur_ops->deferred_free) { + pr_alert("%s: gp_normal without primitives.\n", __func__); + } + if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; - else if (gp_sync && !cur_ops->sync) - pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + pr_info("%s: Testing normal GPs.\n", __func__); + } else if (gp_sync && !cur_ops->sync) { + pr_alert("%s: gp_sync without primitives.\n", __func__); + } if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg) rcu_unexpedite_gp(); if (++expediting > 3) expediting = -expediting; + } else if (!can_expedite) { /* Disabled during boot, recheck. */ + can_expedite = !rcu_gp_is_expedited() && + !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); @@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) while (can_expedite && expediting++ < 0) rcu_unexpedite_gp(); WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " Dynamic grace-period expediting was disabled.\n", + torture_type); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (torture_random(&rand) & 0x80) + if (cur_ops->sync && torture_random(&rand) & 0x80) cur_ops->sync(); - else + else if (cur_ops->exp_sync) cur_ops->exp_sync(); - } else if (gp_normal) { + } else if (gp_normal && cur_ops->sync) { cur_ops->sync(); - } else { + } else if (cur_ops->exp_sync) { cur_ops->exp_sync(); } stutter_wait("rcu_torture_fakewriter"); @@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) atomic_set(&barrier_cbs_count, 0); atomic_set(&barrier_cbs_invoked, 0); barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), GFP_KERNEL); barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); + kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { @@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) * next grace period. Unlikely, but can happen. If it * does happen, the debug-objects subsystem won't have splatted. */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); + pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); /* Try to queue the rh2 pair of callbacks for the same grace period. */ preempt_disable(); /* Prevent preemption from interrupting test. */ @@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) /* Wait for them all to get done so we can safely return. */ rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } @@ -1799,7 +1809,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; if (nfakewriters > 0) { - fakewriter_tasks = kzalloc(nfakewriters * + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1814,7 +1824,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("out of memory"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..fb560fca9ef4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - lockdep_assert_held(&sp->lock); + lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, - &sdp->work, delay); + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); } /* @@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + bool last_lvl; int cpu; unsigned long flags; unsigned long gpseq; int idx; - int idxnext; unsigned long mask; struct srcu_data *sdp; struct srcu_node *snp; @@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - if (snp >= sp->level[rcu_num_lvls - 1]) + last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); @@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check)) + if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); spin_unlock_irq_rcu_node(sp); - /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff - ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, 0); } else { spin_unlock_irq_rcu_node(sp); } @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_rcu_node(sp, flags); - if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, - srcu_get_delay(sp)); + queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_kthread ? rsp->gp_kthread->state : ~0, rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { + pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); } @@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * If RCU is idle, we just wait for the next grace period. @@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* * Pick up grace-period number for new callbacks. If this @@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) @@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && @@ -2296,7 +2297,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - lockdep_assert_held(&rcu_get_root(rsp)->lock); + raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); /* Walk up the rcu_node hierarchy. */ for (;;) { @@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->n_cbs_invoked += count; rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ @@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; + if (ret) return; - } rnp_old = rnp; } /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ @@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } @@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - rdp->n_rcu_pending++; - /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); @@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { - rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { - rdp->n_rp_report_qs++; + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) return 1; - } /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rdp->n_rp_cb_ready++; + if (rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; - } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; + if (cpu_needs_another_gp(rsp, rdp)) return 1; - } /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; + if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ return 1; - } /* Has a new RCU grace period started? */ if (READ_ONCE(rnp->gpnum) != rdp->gpnum || - unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ - rdp->n_rp_gp_started++; + unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - } /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) { - rdp->n_rp_nocb_defer_wakeup++; + if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - } /* nothing to do */ - rdp->n_rp_need_nothing++; return 0; } @@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; @@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); @@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_wq; + void __init rcu_init(void) { int cpu; @@ -4219,6 +4195,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -146,12 +146,6 @@ struct rcu_node { /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -184,13 +178,6 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -217,8 +204,6 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -234,18 +219,7 @@ struct rcu_data { /* Grace period that needs help */ /* from cond_resched(). */ - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_core_needs_qs; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_nocb_defer_wakeup; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ + /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; @@ -256,7 +230,7 @@ struct rcu_data { atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 7) Callback offloading. */ + /* 6) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -283,7 +257,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 8) RCU CPU stall data. */ + /* 7) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -374,10 +348,6 @@ struct rcu_state { /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) } /* + * Return then value that expedited-grace-period counter will have + * at the end of the current grace period. + */ +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +{ + return rcu_seq_endval(&rsp->expedited_sequence); +} + +/* * Record the end of an expedited grace period. */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) @@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, int ret; struct rcu_node *rnp; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; - rdp->exp_dynticks_snap = - rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || - !(rnp->qsmaskinitnext & rdp->grpmask)) - mask_ofl_test |= rdp->grpmask; + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) + mask_ofl_test |= mask; + else + rdp->exp_dynticks_snap = snap; + } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_possible_cpu(rnp, cpu) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -417,6 +435,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; @@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); sched_show_task(t); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period. */ - if (rnp->exp_tasks != NULL) { + if (rnp->exp_tasks != NULL) tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { + else tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to @@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) char *ticks_title; unsigned long ticks_value; + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + if (rsp->gpnum == rdp->gpnum) { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; @@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) smp_mb__before_atomic(); /* _add after CB invocation. */ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - rdp->n_nocbs_invoked += c; } return 0; } @@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 332303be4beb..15b10e210a6b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -22,7 +22,7 @@ static DEFINE_SPINLOCK(sched_debug_lock); if (m) \ seq_printf(m, x); \ else \ - printk(x); \ + pr_cont(x); \ } while (0) /* @@ -475,12 +475,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - SEQ_printf(m, - "\nrunnable tasks:\n" - " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n" - "-------------------------------------------------------" - "----------------------------------------------------\n"); + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); + SEQ_printf(m, " S task PID tree-key switches prio" + " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, "-------------------------------------------------------" + "----------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -501,9 +501,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); #else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -571,9 +573,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); #else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:\n", cpu); #endif #define P(x) \ @@ -600,7 +604,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { struct dl_bw *dl_bw; - SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "dl_rq[%d]:\n", cpu); #define PU(x) \ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -113,16 +113,6 @@ config NO_HZ_FULL endchoice -config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default (except CPU 0)" - depends on NO_HZ_FULL - help - If the user doesn't pass the nohz_full boot option to - define the range of full dynticks CPUs, consider that all - CPUs in the system are full dynticks by default. - Note the boot CPU will still be kept outside the range to - handle the timekeeping duty. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..10b7186d0638 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/hashtable.h> #include <linux/compat.h> +#include <linux/nospec.h> #include "timekeeping.h" #include "posix-timers.h" @@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = { static const struct k_clock *clockid_to_kclock(const clockid_t id) { - if (id < 0) + clockid_t idx = id; + + if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; + } - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + if (id >= ARRAY_SIZE(posix_clocks)) return NULL; - return posix_clocks[id]; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ab92aa4442df..5d4a0342f934 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) return 0; } -static int tick_nohz_init_all(void) -{ - int err = -1; - -#ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - err = 0; - cpumask_setall(tick_nohz_full_mask); - tick_nohz_full_running = true; -#endif - return err; -} - void __init tick_nohz_init(void) { int cpu, ret; - if (!tick_nohz_full_running) { - if (tick_nohz_init_all() < 0) - return; - } + if (!tick_nohz_full_running) + return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c0a9e310d715..01e6b3a38871 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); + return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; @@ -678,8 +712,8 @@ clear: return err; } -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { - .func = bpf_perf_prog_read_value_tp, +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_perf_prog_read_value: - return &bpf_perf_prog_read_value_proto_tp; + return &bpf_perf_prog_read_value_proto; default: return tracing_func_proto(func_id); } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - - BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); - return true; -} - -const struct bpf_verifier_ops tracepoint_verifier_ops = { - .get_func_proto = tp_prog_func_proto, - .is_valid_access = tp_prog_is_valid_access, -}; - -const struct bpf_prog_ops tracepoint_prog_ops = { -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, } const struct bpf_verifier_ops perf_event_verifier_ops = { - .get_func_proto = tp_prog_func_proto, + .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff37daf..2c416509b834 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kprobes.h> #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; @@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_unlock(&event_mutex); } +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + return -ENOMEM; + ret = strncpy_from_user( + func, u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (ret < 0) + goto out; + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + ret = strncpy_from_user( + path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret < 0) + goto out; + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe( + path, p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. + */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1fad24acd444..1cd3fb4d70f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) disable_kprobe(&tk->rp.kp); wait = 1; } + + /* + * if tk is not added to any list, it must be a local trace_kprobe + * created with perf_event_open. We don't need to wait for these + * trace_kprobes + */ + if (list_empty(&tk->list)) + wait = 0; out: if (wait) { /* @@ -659,7 +667,7 @@ static int create_trace_kprobe(int argc, char **argv) char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; char *arg; - unsigned long offset = 0; + long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -747,7 +755,7 @@ static int create_trace_kprobe(int argc, char **argv) symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); - if (ret) { + if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); return ret; } @@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_kprobe_event(struct trace_kprobe *tk) +static inline void init_trace_event_call(struct trace_kprobe *tk, + struct trace_event_call *call) { - struct trace_event_call *call = &tk->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct trace_event_call *call = &tk->tp.call; + int ret = 0; + + init_trace_event_call(tk, call); + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = TRACE_EVENT_FL_KPROBE; - call->class->reg = kprobe_register; - call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", @@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_PERF_EVENTS +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + struct trace_kprobe *tk; + int ret; + char *event; + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk, &tk->tp.call); + + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return &tk->tp.call; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = container_of(event_call, struct trace_kprobe, tp.call); + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d59357308677..daf54bda4dc8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type, } /* Split symbol and offset. */ -int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +int traceprobe_split_symbol_offset(char *symbol, long *offset) { char *tmp; int ret; @@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) if (!offset) return -EINVAL; - tmp = strchr(symbol, '+'); + tmp = strpbrk(symbol, "+-"); if (tmp) { - /* skip sign because kstrtoul doesn't accept '+' */ - ret = kstrtoul(tmp + 1, 0, offset); + ret = kstrtol(tmp, 0, offset); if (ret) return ret; - *tmp = '\0'; } else *offset = 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e101c5bb9eda..75daff22ccea 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name, extern void traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); +extern int traceprobe_split_symbol_offset(char *symbol, long *offset); /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int @@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, } extern int set_print_fmt(struct trace_probe *tp, bool is_return); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); +#endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..2014f4351ae0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static int register_uprobe_event(struct trace_uprobe *tu) +static inline void init_trace_event_call(struct trace_uprobe *tu, + struct trace_event_call *call) { - struct trace_event_call *call = &tu->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; + call->flags = TRACE_EVENT_FL_UPROBE; + call->class->reg = trace_uprobe_register; + call->data = tu; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct trace_event_call *call = &tu->tp.call; + int ret = 0; + + init_trace_event_call(tu, call); + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; @@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - call->flags = TRACE_EVENT_FL_UPROBE; - call->class->reg = trace_uprobe_register; - call->data = tu; ret = trace_add_event_call(call); if (ret) { @@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) return 0; } +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +{ + struct trace_uprobe *tu; + struct inode *inode; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + iput(inode); + return ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. + */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->inode = inode; + tu->filename = kstrdup(name, GFP_KERNEL); + init_trace_event_call(tu, &tu->tp.call); + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ret = -ENOMEM; + goto error; + } + + return &tu->tp.call; +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = container_of(event_call, struct trace_uprobe, tp.call); + + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { |