summaryrefslogtreecommitdiff
path: root/kernel/cgroup/rstat.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 07:04:02 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 07:04:02 +0300
commit9f8413c4a66f2fb776d3dc3c9ed20bf435eb305e (patch)
tree741716da34b378c6ac0a62535c38f6ed9bb974d2 /kernel/cgroup/rstat.c
parentbfe8eb3b85c571f7e94e1039f59b462505b8e0fc (diff)
parenta7fb0423c201ba12815877a0b5a68a6a1710b23a (diff)
downloadlinux-9f8413c4a66f2fb776d3dc3c9ed20bf435eb305e.tar.xz
Merge tag 'cgroup-for-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - Yafang Shao added task_get_cgroup1() helper to enable a similar BPF helper so that BPF progs can be more useful on cgroup1 hierarchies. While cgroup1 is mostly in maintenance mode, this addition is very small while having an outsized usefulness for users who are still on cgroup1. Yafang also optimized root cgroup list access by making it RCU protected in the process. - Waiman Long optimized rstat operation leading to substantially lower and more consistent lock hold time while flushing the hierarchical statistics. As the lock can be acquired briefly in various hot paths, this reduction has cascading benefits. - Waiman also improved the quality of isolation for cpuset's isolated partitions. CPUs which are allocated to isolated partitions are now excluded from running unbound work items and cpu_is_isolated() test which is used by vmstat and memcg to reduce interference now includes cpuset isolated CPUs. While it isn't there yet, the hope is eventually reaching parity with the isolation level provided by the `isolcpus` boot param but in a dynamic manner. * tag 'cgroup-for-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: Move rcu_head up near the top of cgroup_root cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated() check cgroup: Avoid false cacheline sharing of read mostly rstat_cpu cgroup/rstat: Optimize cgroup_rstat_updated_list() cgroup: Fix documentation for cpu.idle cgroup/cpuset: Expose cpuset.cpus.isolated workqueue: Move workqueue_set_unbound_cpumask() and its helpers inside CONFIG_SYSFS cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked() cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask cgroup/cpuset: Keep track of CPUs in isolated partitions selftests/cgroup: Minor code cleanup and reorganization of test_cpuset_prs.sh workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask selftests: cgroup: Fixes a typo in a comment cgroup: Add a new helper for cgroup1 hierarchy cgroup: Add annotation for holding namespace_sem in current_cgns_cgroup_from_root() cgroup: Eliminate the need for cgroup_mutex in proc_cgroup_show() cgroup: Make operations on the cgroup root_list RCU safe cgroup: Remove unnecessary list_empty()
Diffstat (limited to 'kernel/cgroup/rstat.c')
-rw-r--r--kernel/cgroup/rstat.c150
1 files changed, 96 insertions, 54 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c0adb7254b45..a8350d2d63e6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
}
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flush
*
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list built from the tail backward like "pushing"
+ * cgroups into a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+ struct cgroup *child, int cpu)
+{
+ struct cgroup *chead = child; /* Head of child cgroup level */
+ struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
+ struct cgroup *parent, *grandchild;
+ struct cgroup_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+next_level:
+ while (chead) {
+ child = chead;
+ chead = child->rstat_flush_next;
+ parent = cgroup_parent(child);
+
+ /* updated_next is parent cgroup terminated */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = cgroup_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ chead = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned cgroup is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child cgroups if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent cgroup. An exception
+ * here is the cgroup root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+ struct cgroup *head = NULL, *parent, *child;
+ unsigned long flags;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
*/
- if (!pos) {
- pos = root;
- /* return NULL if this subtree is not on-list */
- if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
- return NULL;
- } else {
- pos = cgroup_parent(pos);
- }
+ raw_spin_lock_irqsave(cpu_lock, flags);
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
- }
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ goto unlock_ret;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
+ parent = cgroup_parent(root);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (*nextp != pos) {
+ while (*nextp != root) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
@@ -142,7 +187,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
}
rstatc->updated_next = NULL;
- return pos;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+ return head;
}
/*
@@ -176,21 +231,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
- unsigned long flags;
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
@@ -202,7 +245,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {