summary refs log tree commit diff
path: root/kernel/cgroup/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r-- kernel/cgroup/cpuset.c 264
1 files changed, 164 insertions, 100 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58e6f18f01c1..58ec88efa4f8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
/*
* Percpu kthreads in top_cpuset are ignored
*/
- if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+ if (kthread_is_per_cpu(task))
continue;
cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
} else {
@@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus) {
+ if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
cpumask_or(new_cpus, parent->effective_cpus,
parent->subparts_cpus);
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
@@ -1277,6 +1277,52 @@ enum subparts_cmd {
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on);
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag: CS_CPU_EXCLUSIVE is turned on when
+ * @new_prs > 0 and off otherwise, calling update_flag() only on a change.
+ * Return: 0 if successful, PERR_NOTEXCL if the flag cannot be turned on
+ */
+static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > 0);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * CS_SCHED_LOAD_BALANCE is cleared only for PRS_ISOLATED. The domains are
+ * rebuilt if the flag changes or if the old or new state is > 0.
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool new_lb = (new_prs != PRS_ISOLATED);
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+ if (rebuild_domains)
+ rebuild_sched_domains_locked();
+}
+
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cs: The cpuset that requests change in partition root state
@@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if ((newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cs->cpus_allowed)))
+ if (!newmask && cpumask_empty(cs->cpus_allowed))
return PERR_CPUSEMPTY;
/*
@@ -1404,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
- if (adding &&
+ } else if (adding &&
cpumask_subset(parent->effective_cpus, tmp->addmask) &&
!cpumask_intersects(tmp->delmask, cpu_active_mask) &&
partition_is_populated(parent, cs)) {
@@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
/*
* Transitioning between invalid to valid or vice versa may require
- * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ * changing CS_CPU_EXCLUSIVE.
*/
if (old_prs != new_prs) {
- if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
- (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
- return PERR_NOTEXCL;
- if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ int err = update_partition_exclusive(cs, new_prs);
+
+ if (err)
+ return err;
}
/*
@@ -1520,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
- if (adding || deleting)
+ if (adding || deleting) {
update_tasks_cpumask(parent, tmp->addmask);
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, tmp);
+ }
/*
- * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
- * rebuild_sched_domains_locked() may be called.
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
+ * Update the load balance flag and scheduling domain if
+ * cpus_read_trylock() is successful.
*/
- if (old_prs != new_prs) {
- if (old_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- else if (new_prs == PRS_ISOLATED)
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ update_partition_sd_lb(cs, old_prs);
+ cpus_read_unlock();
}
+
notify_partition_change(cs, old_prs);
return 0;
}
/*
+ * update_cpumasks_hier() flags
+ */
+#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
+#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
+
+/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
- bool force)
+ int flags)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
}
/*
- * Skip the whole subtree if the cpumask remains the same
- * and has no partition root state and force flag not set.
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) it has no partition root state,
+ * 3) the HIER_CHECKALL flag is not set, and
+ * 4) for v2, its load balance state is the same as its parent's.
*/
- if (!cp->partition_root_state && !force &&
- cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+ if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1676,6 +1740,20 @@ update_parent_subparts:
update_tasks_cpumask(cp, tmp->new_cpus);
/*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
+
+ /*
* On legacy hierarchy, if the effective cpumask of any non-
* empty cpuset is changed, we need to rebuild sched domains.
* On default hierarchy, the cpuset needs to be a partition
@@ -1692,7 +1770,7 @@ update_parent_subparts:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains)
+ if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
rebuild_sched_domains_locked();
}
@@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* to use the right effective_cpus value.
*
* The update_cpumasks_hier() function may sleep. So we have to
- * release the RCU read lock before calling it.
+ * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+ * flag is used to suppress rebuild of sched domains as the callers
+ * will take care of that.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp, false);
+ update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
struct tmpmasks tmp;
bool invalidate = false;
+ int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /*
- * Use the cpumasks in trialcs for tmpmasks when they are pointers
- * to allocated cpumasks.
- *
- * Note that update_parent_subparts_cpumask() uses only addmask &
- * delmask, but not new_cpus.
- */
- tmp.addmask = trialcs->subparts_cpus;
- tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = NULL;
-#endif
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
retval = validate_change(cs, trialcs);
@@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = 0;
}
if (retval < 0)
- return retval;
+ goto out_free;
if (cs->partition_root_state) {
if (invalidate)
@@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
spin_unlock_irq(&callback_lock);
-#ifdef CONFIG_CPUMASK_OFFSTACK
- /* Now trialcs->cpus_allowed is available */
- tmp.new_cpus = trialcs->cpus_allowed;
-#endif
-
/* effective_cpus will be updated here */
- update_cpumasks_hier(cs, &tmp, false);
+ update_cpumasks_hier(cs, &tmp, 0);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmp);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+ update_partition_sd_lb(cs, old_prs);
}
+out_free:
+ free_cpumasks(NULL, &tmp);
return 0;
}
@@ -2242,7 +2313,6 @@ out:
static int update_prstate(struct cpuset *cs, int new_prs)
{
int err = PERR_NONE, old_prs = cs->partition_root_state;
- bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
+ err = update_partition_exclusive(cs, new_prs);
+ if (err)
+ goto out;
+
if (!old_prs) {
/*
- * Turning on partition root requires setting the
- * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be empty.
+ * cpus_allowed cannot be empty.
*/
if (cpumask_empty(cs->cpus_allowed)) {
err = PERR_CPUSEMPTY;
goto out;
}
- err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err) {
- err = PERR_NOTEXCL;
- goto out;
- }
-
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
- if (err) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- goto out;
- }
-
- if (new_prs == PRS_ISOLATED) {
- /*
- * Disable the load balance flag should not return an
- * error unless the system is running out of memory.
- */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- sched_domain_rebuilt = true;
- }
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
- sched_domain_rebuilt = true;
- goto out; /* Sched domain is rebuilt in update_flag() */
+ ;
} else {
/*
* Switching back to member is always allowed even if it
@@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs)
compute_effective_cpumask(cs->effective_cpus, cs, parent);
spin_unlock_irq(&callback_lock);
}
-
- /* Turning off CS_CPU_EXCLUSIVE will not return error */
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-
- if (!is_sched_load_balance(cs)) {
- /* Make sure load balance is on */
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
- sched_domain_rebuilt = true;
- }
}
-
- update_tasks_cpumask(parent, tmpmask.new_cpus);
-
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmpmask);
-
- if (!sched_domain_rebuilt)
- rebuild_sched_domains_locked();
out:
/*
- * Make partition invalid if an error happen
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
*/
- if (err)
+ if (err) {
new_prs = -new_prs;
+ update_partition_exclusive(cs, new_prs);
+ }
+
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
+
/*
* Update child cpusets, if present.
* Force update if switching back to member.
*/
if (!list_empty(&cs->css.children))
- update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
@@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
+ bool cpus_updated, mems_updated;
int ret;
/* used later by cpuset_attach() */
@@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
- goto out_unlock;
+
+ /*
+ * Skip rights over task check in v2 when nothing changes;
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission().
+ */
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
if (dl_task(task)) {
cs->nr_migrate_dl_tasks++;
@@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
+
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- mutex_unlock(&cpuset_mutex);
-
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
*/
- if (is_empty)
+ if (is_empty) {
+ mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
-
- mutex_lock(&cpuset_mutex);
+ mutex_lock(&cpuset_mutex);
+ }
}
static void
@@ -3691,6 +3753,7 @@ unlock:
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * @work: unused
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
/**
* cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),