summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_struct_ops.c7
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/stackmap.c3
-rw-r--r--kernel/cgroup/cgroup.c17
-rw-r--r--kernel/cgroup/cpuset.c56
-rw-r--r--kernel/events/core.c34
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/sched/debug.c8
-rw-r--r--kernel/sched/fair.c6
-rw-r--r--kernel/workqueue.c18
10 files changed, 109 insertions, 44 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d6731c32864e..9abcc33f02cf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
+ u32 flags;
moff = btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
@@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ flags = st_ops->func_models[i].ret_size > 0 ?
+ BPF_TRAMP_F_RET_FENTRY_RET : 0;
err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ &st_ops->func_models[i],
+ flags, tprogs, NULL);
if (err < 0)
goto reset_unlock;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9f4636d021b1..d6b7dfdd8066 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 09a3fd97d329..6e75bbee39f0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8afa8690d288..570b0c97392a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6574,22 +6574,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
+
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->cgroup = cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index df1ccf4558f8..2a9695ccb65f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -311,17 +311,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void)
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
@@ -435,7 +437,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -726,7 +728,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes cpus_read_lock().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void)
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
@@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
@@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1980,7 +1982,7 @@ out:
* cs: the cpuset to update
* new_prs: new partition root state
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
@@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
+ /* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
@@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
@@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0c000cb01eeb..f23ca260307f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ } else {
+ ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
+ group_update_userpage(event);
}
-
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
}
return 0;
@@ -6324,6 +6348,8 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
diff --git a/kernel/module.c b/kernel/module.c
index 40ec9a030eec..5c26a76e800b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod)
/* Fix init/exit functions to point to the CFI jump table */
if (init)
mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
if (exit)
mod->exit = *exit;
+#endif
cfi_module_add(mod, module_addr_min);
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 49716228efb4..17a653b67006 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[16];
+ unsigned int scaling;
if (cnt > 15)
cnt = 15;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
- if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ if (kstrtouint(buf, 10, &scaling))
return -EINVAL;
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
if (sched_update_scaling())
return -EINVAL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..f6a05d9b5443 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
+ /* Nothing to run but something to decay (on_list)? Complete the branch */
+ if (!cfs_rq->load.weight) {
+ if (cfs_rq->on_list)
+ goto unthrottle_throttle;
return;
+ }
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33a6b4a2443d..1b3eb1e9531f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4830,8 +4830,16 @@ void show_workqueue_state(void)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
show_pwq(pwq);
+ printk_deferred_exit();
+ }
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
@@ -4849,7 +4857,12 @@ void show_workqueue_state(void)
raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
-
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" hung=%us workers=%d",
@@ -4864,6 +4877,7 @@ void show_workqueue_state(void)
first = false;
}
pr_cont("\n");
+ printk_deferred_exit();
next_pool:
raw_spin_unlock_irqrestore(&pool->lock, flags);
/*