From f279294b329363eb6ada568e494d609ef78e3e8e Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 20:44:14 +0800 Subject: misc_cgroup: introduce misc.events to count failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce misc.events to make it easier for us to understand the pressure of resources. Currently only the 'max' event is implemented, which indicates the times the resource is about to exceeds the max limit. Signed-off-by: Chunguang Xu Reviewed-by: Vipin Sharma Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 5 +++++ kernel/cgroup/misc.c | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index da2367e2ac1e..091f2d2a1aec 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -36,6 +36,7 @@ struct misc_cg; struct misc_res { unsigned long max; atomic_long_t usage; + atomic_long_t events; bool failed; }; @@ -46,6 +47,10 @@ struct misc_res { */ struct misc_cg { struct cgroup_subsys_state css; + + /* misc.events */ + struct cgroup_file events_file; + struct misc_res res[MISC_CG_RES_TYPES]; }; diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ec02d963cad1..4b2b49267384 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -171,6 +171,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, return 0; err_charge: + for (j = i; j; j = parent_misc(j)) { + atomic_long_inc(&j->res[type].events); + cgroup_file_notify(&j->events_file); + } + for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); misc_cg_cancel_charge(type, i, amount); @@ -335,6 +340,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + struct misc_cg *cg = css_misc(seq_css(sf)); + unsigned long events, i; + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + events = atomic_long_read(&cg->res[i].events); + if (READ_ONCE(misc_res_capacity[i]) || events) + seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + } + return 0; +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -353,6 +371,12 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_file), + .seq_show = misc_events_show, + }, {} }; -- cgit v1.2.3 From b03357528fd9516600ff358ab5d4b887b8cb80e8 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 20:44:15 +0800 Subject: misc_cgroup: remove error log to avoid log flood MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In scenarios where containers are frequently created and deleted, a large number of error logs maybe generated. The logs only show which node is about to go over the max limit, not the node which resource request failed. As misc.events has provided relevant information, maybe we can remove this log. Signed-off-by: Chunguang Xu Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 1 - kernel/cgroup/misc.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 091f2d2a1aec..c238207d1615 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -37,7 +37,6 @@ struct misc_res { unsigned long max; atomic_long_t usage; atomic_long_t events; - bool failed; }; /** diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 4b2b49267384..fe3e8a0eb7ed 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, new_usage = atomic_long_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { - if (!res->failed) { - pr_info("cgroup: charge rejected by the misc controller for %s resource in ", - misc_res_name[type]); - pr_cont_cgroup_path(i->css.cgroup); - pr_cont("\n"); - res->failed = true; - } ret = -EBUSY; goto err_charge; } -- cgit v1.2.3 From 4b53bb873fcd578a1075b79e7bea0143b4e1ca67 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 20:44:16 +0800 Subject: docs/cgroup: add entry for misc.events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added descriptions of misc.events. Signed-off-by: Chunguang Xu Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4d8c27eca96b..6658598c4d71 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2310,6 +2310,16 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_ Limits can be set higher than the capacity value in the misc.capacity file. + misc.events + A read-only flat-keyed file which exists on non-root cgroups. The + following entries are defined. Unless specified otherwise, a value + change in this file generates a file modified event. All fields in + this file are hierarchical. + + max + The number of times the cgroup's resource usage was + about to go over the max boundary. + Migration and Ownership ~~~~~~~~~~~~~~~~~~~~~~~ -- cgit v1.2.3 From 7ee285395b211cad474b2b989db52666e0430daf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 18 Sep 2021 18:53:08 -0400 Subject: cgroup: Make rebind_subsystems() disable v2 controllers all at once It was found that the following warning was displayed when remounting controllers from cgroup v2 to v1: [ 8042.997778] WARNING: CPU: 88 PID: 80682 at kernel/cgroup/cgroup.c:3130 cgroup_apply_control_disable+0x158/0x190 : [ 8043.091109] RIP: 0010:cgroup_apply_control_disable+0x158/0x190 [ 8043.096946] Code: ff f6 45 54 01 74 39 48 8d 7d 10 48 c7 c6 e0 46 5a a4 e8 7b 67 33 00 e9 41 ff ff ff 49 8b 84 24 e8 01 00 00 0f b7 40 08 eb 95 <0f> 0b e9 5f ff ff ff 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 [ 8043.115692] RSP: 0018:ffffba8a47c23d28 EFLAGS: 00010202 [ 8043.120916] RAX: 0000000000000036 RBX: ffffffffa624ce40 RCX: 000000000000181a [ 8043.128047] RDX: ffffffffa63c43e0 RSI: ffffffffa63c43e0 RDI: ffff9d7284ee1000 [ 8043.135180] RBP: ffff9d72874c5800 R08: ffffffffa624b090 R09: 0000000000000004 [ 8043.142314] R10: ffffffffa624b080 R11: 0000000000002000 R12: ffff9d7284ee1000 [ 8043.149447] R13: ffff9d7284ee1000 R14: ffffffffa624ce70 R15: ffffffffa6269e20 [ 8043.156576] FS: 00007f7747cff740(0000) GS:ffff9d7a5fc00000(0000) knlGS:0000000000000000 [ 8043.164663] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8043.170409] CR2: 00007f7747e96680 CR3: 0000000887d60001 CR4: 00000000007706e0 [ 8043.177539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 8043.184673] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 8043.191804] PKRU: 55555554 [ 8043.194517] Call Trace: [ 8043.196970] rebind_subsystems+0x18c/0x470 [ 8043.201070] cgroup_setup_root+0x16c/0x2f0 [ 8043.205177] cgroup1_root_to_use+0x204/0x2a0 [ 8043.209456] cgroup1_get_tree+0x3e/0x120 [ 8043.213384] vfs_get_tree+0x22/0xb0 [ 8043.216883] do_new_mount+0x176/0x2d0 [ 8043.220550] __x64_sys_mount+0x103/0x140 [ 8043.224474] do_syscall_64+0x38/0x90 [ 8043.228063] entry_SYSCALL_64_after_hwframe+0x44/0xae It was caused by the fact that rebind_subsystem() disables controllers to be rebound one by one. If more than one disabled controllers are originally from the default hierarchy, it means that cgroup_apply_control_disable() will be called multiple times for the same default hierarchy. A controller may be killed by css_kill() in the first round. In the second round, the killed controller may not be completely dead yet leading to the warning. To avoid this problem, we collect all the ssid's of controllers that needed to be disabled from the default hierarchy and then disable them in one go instead of one by one. Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..2e98db4558f2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1740,6 +1740,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1756,8 +1757,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1766,10 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); -- cgit v1.2.3 From 0061270307f2ad5fa01db1ac5f1da0e83410adaf Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Oct 2021 20:19:48 +0000 Subject: cgroup: cgroup-v1: do not exclude cgrp_dfl_root Found an issue within cgroup_attach_task_all() fn which seem to exclude cgrp_dfl_root (cgroupv2) while attaching tasks to the given cgroup. This was noticed when the system was running qemu/kvm with kernel vhost helper threads. It appears that the vhost layer which uses cgroup_attach_task_all() fn to assign the vhost kthread to the right qemu cgroup works fine with cgroupv1 based configuration but not in cgroupv2. With cgroupv2, the vhost helper thread ends up just belonging to the root cgroup as is shown below: $ stat -fc %T /sys/fs/cgroup/ cgroup2fs $ sudo pgrep qemu 1916421 $ ps -eL | grep 1916421 1916421 1916421 ? 00:00:01 qemu-system-x86 1916421 1916431 ? 00:00:00 call_rcu 1916421 1916435 ? 00:00:00 IO mon_iothread 1916421 1916436 ? 00:00:34 CPU 0/KVM 1916421 1916439 ? 00:00:00 SPICE Worker 1916421 1916440 ? 00:00:00 vnc_worker 1916433 1916433 ? 00:00:00 vhost-1916421 1916437 1916437 ? 00:00:00 kvm-pit/1916421 $ cat /proc/1916421/cgroup 0::/machine.slice/machine-qemu\x2d18\x2dDroplet\x2d7572850.scope/emulator $ cat /proc/1916439/cgroup 0::/machine.slice/machine-qemu\x2d18\x2dDroplet\x2d7572850.scope/emulator $ cat /proc/1916433/cgroup 0::/ From above, it can be seen that the vhost kthread (PID: 1916433) doesn't seem to belong the qemu cgroup like other qemu PIDs. After applying this patch: $ pgrep qemu 1643 $ ps -eL | grep 1643 1643 1643 ? 00:00:00 qemu-system-x86 1643 1645 ? 00:00:00 call_rcu 1643 1648 ? 00:00:00 IO mon_iothread 1643 1649 ? 00:00:00 CPU 0/KVM 1643 1652 ? 00:00:00 SPICE Worker 1643 1653 ? 00:00:00 vnc_worker 1647 1647 ? 00:00:00 vhost-1643 1651 1651 ? 00:00:00 kvm-pit/1643 $ cat /proc/1647/cgroup 0::/machine.slice/machine-qemu\x2d18\x2dDroplet\x2d7572850.scope/emulator Suggested-by: Tejun Heo Signed-off-by: Vishal Verma Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 35b920328344..f6cc5f8484dc 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgrp_dfl_root) - continue; - spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); -- cgit v1.2.3 From be288169712f3dea0bc6b50c00b3ab53d85f1435 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 24 Oct 2021 23:19:14 -0700 Subject: cgroup: reduce dependency on cgroup_mutex Currently cgroup_get_from_path() and cgroup_get_from_id() grab cgroup_mutex before traversing the default hierarchy to find the kernfs_node corresponding to the path/id and then extract the linked cgroup. Since cgroup_mutex is still held, it is guaranteed that the cgroup will be alive and the reference can be taken on it. However similar guarantee can be provided without depending on the cgroup_mutex and potentially reducing avenues of cgroup_mutex contentions. The kernfs_node's priv pointer is RCU protected pointer and with just rcu read lock we can grab the reference on the cgroup without cgroup_mutex. So, remove cgroup_mutex from them. Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 51 ++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2e98db4558f2..003204c85893 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5932,17 +5932,20 @@ struct cgroup *cgroup_get_from_id(u64 id) struct kernfs_node *kn; struct cgroup *cgrp = NULL; - mutex_lock(&cgroup_mutex); kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out_unlock; + goto out; + + rcu_read_lock(); - cgrp = kn->priv; - if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp)) + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; + + rcu_read_unlock(); + kernfs_put(kn); -out_unlock: - mutex_unlock(&cgroup_mutex); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6495,30 +6498,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) - * if @path points to a non-directory. + * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already + * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); + struct cgroup *cgrp = ERR_PTR(-ENOENT); kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); - if (kn) { - if (kernfs_type(kn) == KERNFS_DIR) { - cgrp = kn->priv; - cgroup_get_live(cgrp); - } else { - cgrp = ERR_PTR(-ENOTDIR); - } - kernfs_put(kn); - } else { - cgrp = ERR_PTR(-ENOENT); + if (!kn) + goto out; + + if (kernfs_type(kn) != KERNFS_DIR) { + cgrp = ERR_PTR(-ENOTDIR); + goto out_kernfs; } - mutex_unlock(&cgroup_mutex); + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + +out_kernfs: + kernfs_put(kn); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); -- cgit v1.2.3 From bb758421416fd8af2267a8e07d53fb8328b07c4e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 24 Oct 2021 23:19:15 -0700 Subject: cgroup: remove cgroup_mutex from cgroupstats_build The function cgroupstats_build extracts cgroup from the kernfs_node's priv pointer which is a RCU pointer. So, there is no need to grab cgroup_mutex. Just get the reference on the cgroup before using and remove the cgroup_mutex altogether. Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f6cc5f8484dc..fd14a60379c1 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -698,8 +698,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) kernfs_type(kn) != KERNFS_DIR) return -EINVAL; - mutex_lock(&cgroup_mutex); - /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), @@ -707,9 +705,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { + if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); return -ENOENT; } rcu_read_unlock(); @@ -737,7 +734,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); - mutex_unlock(&cgroup_mutex); + cgroup_put(cgrp); return 0; } -- cgit v1.2.3 From 822bc9bac9e9a2f76a772a34f745962dfc223353 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 24 Oct 2021 23:19:16 -0700 Subject: cgroup: no need for cgroup_mutex for /proc/cgroups On the real systems, the cgroups hierarchies are setup early and just once by the node controller, so, other than number of cgroups, all information in /proc/cgroups remain same for the system uptime. Let's remove the cgroup_mutex usage on reading /proc/cgroups. There is a chance of inconsistent number of cgroups for co-mounted cgroups while printing the information from /proc/cgroups but that is not a big issue. In addition /proc/cgroups is a v1 specific interface, so the dependency on it should reduce over time. The main motivation for removing the cgroup_mutex from /proc/cgroups is to reduce the avenues of its contention. On our fleet, we have observed buggy application hammering on /proc/cgroups and drastically slowing down the node controller on the system which have many negative consequences on other workloads running on the system. Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fd14a60379c1..81c9e0685948 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -659,11 +659,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. + * Grab the subsystems state racily. No need to add avenue to + * cgroup_mutex contention. */ - mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", @@ -671,7 +669,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); - mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From 81c49d39aea8a10e6d05d3aa1cb65ceb721e19b0 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Thu, 28 Oct 2021 15:15:27 -0700 Subject: cgroup: Fix rootcg cpu.stat guest double counting In account_guest_time in kernel/sched/cputime.c guest time is attributed to both CPUTIME_NICE and CPUTIME_USER in addition to CPUTIME_GUEST_NICE and CPUTIME_GUEST respectively. Therefore, adding both to calculate usage results in double counting any guest time at the rootcg. Fixes: 936f2a70f207 ("cgroup: add cpu.stat file to root cgroup") Signed-off-by: Dan Schatzberg Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b264ab5652ba..1486768f2318 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -433,8 +433,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; } } -- cgit v1.2.3 From 588e5d8766486e52ee332a4bb097b016a355b465 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Fri, 29 Oct 2021 02:39:06 +0000 Subject: cgroup: bpf: Move wrapper for __cgroup_bpf_*() to kernel/bpf/cgroup.c In commit 324bda9e6c5a("bpf: multi program support for cgroup+bpf") cgroup_bpf_*() called from kernel/bpf/syscall.c, but now they are only used in kernel/bpf/cgroup.c, so move these function to kernel/bpf/cgroup.c, like cgroup_bpf_replace(). Signed-off-by: He Fengqing Signed-off-by: Tejun Heo --- include/linux/bpf-cgroup.h | 20 ----------------- kernel/bpf/cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++-------- kernel/cgroup/cgroup.c | 38 -------------------------------- 3 files changed, 45 insertions(+), 67 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2746fd804216..9aad4e3ca29b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -157,26 +157,6 @@ struct cgroup_bpf { int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); -int __cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags); -int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type); -int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); - -/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags); -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type); -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); - int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 03145d45e3d5..2ca643af9a54 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -430,10 +430,10 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags) +static int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; @@ -523,6 +523,20 @@ cleanup: return err; } +static int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, + u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} + /* Swap updated BPF program for given link in effective program arrays across * all descendant cgroups. This function is guaranteed to succeed. */ @@ -672,14 +686,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL - * @prog: A link to detach or NULL + * @link: A link to detach or NULL * @type: Type of detach operation * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type) +static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; @@ -730,9 +744,20 @@ cleanup: return err; } +static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); + mutex_unlock(&cgroup_mutex); + return ret; +} + /* Must be called with cgroup_mutex held to avoid races. */ -int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) +static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; @@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 003204c85893..c73f634c8328 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6676,44 +6676,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -#ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, - u32 flags) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); - return ret; -} -#endif /* CONFIG_CGROUP_BPF */ - #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) -- cgit v1.2.3