From af6363374cbda5007e46efa99f7346efd4eea5fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: make CONFIG_CGROUP_NET_PRIO bool and drop unnecessary init_netclassid_cgroup() net_prio is the only cgroup which is allowed to be built as a module. The savings from allowing one controller to be built as a module are tiny especially given that cgroup module support itself adds quite a bit of complexity. Given that none of other controllers has much chance of being made a module and that we're unlikely to add new modular controllers, the added complexity is simply not justifiable. As a first step to drop cgroup module support, this patch changes the config option to bool from tristate and drops module related code from it. Also, while an earlier commit fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core") dropped module support from net_cls cgroup, it retained a call to cgroup_load_subsys(), which is noop for built-in controllers. Drop it along with init_netclassid_cgroup(). v2: Removed modular version of task_netprioidx() in include/net/netprio_cgroup.h as suggested by Li Zefan. v3: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). net_cls cgroup part is mostly dropped except for removal of init_netclassid_cgroup(). Signed-off-by: Tejun Heo Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: Li Zefan Cc: Thomas Graf --- include/net/netprio_cgroup.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index dafc09f0fdbc..b7ff5bd3c3c3 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -27,7 +27,6 @@ struct netprio_map { void sock_update_netprioidx(struct sock *sk); -#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO) static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -39,20 +38,6 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } -#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO) -static inline u32 task_netprioidx(struct task_struct *p) -{ - struct cgroup_subsys_state *css; - u32 idx = 0; - - rcu_read_lock(); - css = task_css(p, net_prio_subsys_id); - if (css) - idx = css->cgroup->id; - rcu_read_unlock(); - return idx; -} -#endif #else /* !CONFIG_CGROUP_NET_PRIO */ static inline u32 task_netprioidx(struct task_struct *p) { -- cgit v1.2.3 From 3ed80a62bf959d34ebd4d553b026fbe7e6fbcc54 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: drop module support With module supported dropped from net_prio, no controller is using cgroup module support. None of actual resource controllers can be built as a module and we aren't gonna add new controllers which don't control resources. This patch drops module support from cgroup. * cgroup_[un]load_subsys() and cgroup_subsys->module removed. * As there's no point in distinguishing IS_BUILTIN() and IS_MODULE(), cgroup_subsys.h now uses IS_ENABLED() directly. * enum cgroup_subsys_id now exactly matches the list of enabled controllers as ordered in cgroup_subsys.h. * cgroup_subsys[] is now a contiguously occupied array. Size specification is no longer necessary and dropped. * for_each_builtin_subsys() is removed and for_each_subsys() is updated to not require any locking. * module ref handling is removed from rebind_subsystems(). * Module related comments dropped. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). v3: Added {} around the if (need_forkexit_callback) block in cgroup_post_fork() for readability as suggested by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.c | 1 - include/linux/cgroup.h | 29 +---- include/linux/cgroup_subsys.h | 24 ++-- kernel/cgroup.c | 284 +++--------------------------------------- net/core/netclassid_cgroup.c | 1 - net/core/netprio_cgroup.c | 1 - 6 files changed, 32 insertions(+), 308 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4e491d9b5292..660d419918a7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -914,7 +914,6 @@ struct cgroup_subsys blkio_subsys = { .can_attach = blkcg_can_attach, .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, - .module = THIS_MODULE, }; EXPORT_SYMBOL_GPL(blkio_subsys); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104b..d842a737d448 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -37,28 +37,13 @@ extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -extern int cgroup_load_subsys(struct cgroup_subsys *ss); -extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern int proc_cgroup_show(struct seq_file *, void *); -/* - * Define the enumeration of all cgroup subsystems. - * - * We define ids for builtin subsystems and then modular ones. - */ +/* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, enum cgroup_subsys_id { -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include -#undef IS_SUBSYS_ENABLED - CGROUP_BUILTIN_SUBSYS_COUNT, - - __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, - -#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) -#include -#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; #undef SUBSYS @@ -370,10 +355,9 @@ struct css_set { struct list_head cgrp_links; /* - * Set of subsystem states, one for each subsystem. This array - * is immutable after creation apart from the init_css_set - * during subsystem registration (at boot time) and modular subsystem - * loading/unloading. + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -620,15 +604,10 @@ struct cgroup_subsys { /* base cftypes, automatically [de]registered with subsys itself */ struct cftype *base_cftypes; struct cftype_set base_cftset; - - /* should be defined only by modular subsystems */ - struct module *module; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include -#undef IS_SUBSYS_ENABLED #undef SUBSYS /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 7b99d717411d..11c42f6a25a8 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -3,51 +3,51 @@ * * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ -#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS) +#if IS_ENABLED(CONFIG_CPUSETS) SUBSYS(cpuset) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG) +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) SUBSYS(debug) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED) +#if IS_ENABLED(CONFIG_CGROUP_SCHED) SUBSYS(cpu_cgroup) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT) +#if IS_ENABLED(CONFIG_CGROUP_CPUACCT) SUBSYS(cpuacct) #endif -#if IS_SUBSYS_ENABLED(CONFIG_MEMCG) +#if IS_ENABLED(CONFIG_MEMCG) SUBSYS(mem_cgroup) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE) +#if IS_ENABLED(CONFIG_CGROUP_DEVICE) SUBSYS(devices) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER) +#if IS_ENABLED(CONFIG_CGROUP_FREEZER) SUBSYS(freezer) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID) +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) SUBSYS(net_cls) #endif -#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP) +#if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(blkio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) +#if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) SUBSYS(net_prio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) +#if IS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..ccb16b47e293 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -120,15 +119,9 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. - */ +/* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[] = { #include }; @@ -258,30 +251,13 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all loaded cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. */ #define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ - else - -/** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. - */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /* iterate across the active hierarchies */ #define for_each_active_root(root) \ @@ -975,50 +951,24 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) - continue; - - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } - - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } - - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + for_each_subsys(ss, i) + if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + return -EBUSY; ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - goto out_put; + return ret; /* * Nothing can fail from this point on. Remove files for the @@ -1057,9 +1007,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); root->subsys_mask &= ~bit; } } @@ -1071,12 +1018,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -4506,7 +4447,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); @@ -4559,185 +4500,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); } -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. */ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) { - ss->css_free(css); - goto err_unload; - } - - /* success! */ - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - struct cgroup_subsys_state *css; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - css = cgroup_css(cgroup_dummy_top, ss); - if (css) - offline_css(css); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. - */ - if (css) - ss->css_free(css); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); - /** * cgroup_init_early - cgroup initialization at system boot * @@ -4763,8 +4527,7 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->css_alloc); @@ -4797,7 +4560,7 @@ int __init cgroup_init(void) if (err) return err; - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); } @@ -5032,15 +4795,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5105,11 +4860,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. - */ - for_each_builtin_subsys(ss, i) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5228,11 +4980,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. - */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 9fc7f90d034c..9e5ad5d74e60 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -110,5 +110,4 @@ struct cgroup_subsys net_cls_subsys = { .attach = cgrp_attach, .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cc3a31e7dc08..857e1603f9b7 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -252,7 +252,6 @@ struct cgroup_subsys net_prio_subsys = { .attach = net_prio_attach, .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; static int netprio_device_event(struct notifier_block *unused, -- cgit v1.2.3 From 073219e995b4a3f8cf1ce8228b7ef440b6994ac0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: clean up cgroup_subsys names and initialization cgroup_subsys is a bit messier than it needs to be. * The name of a subsys can be different from its internal identifier defined in cgroup_subsys.h. Most subsystems use the matching name but three - cpu, memory and perf_event - use different ones. * cgroup_subsys_id enums are postfixed with _subsys_id and each cgroup_subsys is postfixed with _subsys. cgroup.h is widely included throughout various subsystems, it doesn't and shouldn't have claim on such generic names which don't have any qualifier indicating that they belong to cgroup. * cgroup_subsys->subsys_id should always equal the matching cgroup_subsys_id enum; however, we require each controller to initialize it and then BUG if they don't match, which is a bit silly. This patch cleans up cgroup_subsys names and initialization by doing the followings. * cgroup_subsys_id enums are now postfixed with _cgrp_id, and each cgroup_subsys with _cgrp_subsys. * With the above, renaming subsys identifiers to match the userland visible names doesn't cause any naming conflicts. All non-matching identifiers are renamed to match the official names. cpu_cgroup -> cpu mem_cgroup -> memory perf -> perf_event * controllers no longer need to initialize ->subsys_id and ->name. They're generated in cgroup core and set automatically during boot. * Redundant cgroup_subsys declarations removed. * While updating BUG_ON()s in cgroup_init_early(), convert them to WARN()s. BUGging that early during boot is stupid - the kernel can't print anything, even through serial console and the trap handler doesn't even link stack frame properly for back-tracing. This patch doesn't introduce any behavior changes. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). Signed-off-by: Tejun Heo Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: "Rafael J. Wysocki" Acked-by: Michal Hocko Acked-by: Peter Zijlstra Acked-by: Aristeu Rozanski Acked-by: Ingo Molnar Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Serge E. Hallyn Cc: Vivek Goyal Cc: Thomas Graf --- block/blk-cgroup.c | 8 +++----- block/blk-cgroup.h | 2 +- fs/bio.c | 2 +- include/linux/cgroup.h | 7 ++++--- include/linux/cgroup_subsys.h | 6 +++--- include/linux/hugetlb_cgroup.h | 2 +- include/linux/memcontrol.h | 2 +- include/net/cls_cgroup.h | 2 +- include/net/netprio_cgroup.h | 2 +- kernel/cgroup.c | 34 ++++++++++++++++++++-------------- kernel/cgroup_freezer.c | 8 ++------ kernel/cpuset.c | 10 ++++------ kernel/events/core.c | 8 +++----- kernel/sched/core.c | 6 ++---- kernel/sched/cpuacct.c | 6 ++---- mm/hugetlb_cgroup.c | 9 +++------ mm/memcontrol.c | 22 ++++++++++------------ net/core/netclassid_cgroup.c | 6 ++---- net/core/netprio_cgroup.c | 4 +--- net/ipv4/tcp_memcontrol.c | 2 +- security/device_cgroup.c | 8 ++------ 21 files changed, 68 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 660d419918a7..1cef07cf9c21 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -906,16 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", +struct cgroup_subsys blkio_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, }; -EXPORT_SYMBOL_GPL(blkio_subsys); +EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1105,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); + WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); ret = 0; out_unlock: mutex_unlock(&blkcg_pol_mutex); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 86154eab9523..453b528c8e19 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_subsys_id)); + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) diff --git a/fs/bio.c b/fs/bio.c index 75c49a382239..4872102b839e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1965,7 +1965,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_css(current, blkio_subsys_id); + css = task_css(current, blkio_cgrp_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d842a737d448..cd6611e622fd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -41,7 +41,7 @@ extern int cgroupstats_build(struct cgroupstats *stats, extern int proc_cgroup_show(struct seq_file *, void *); /* define the enumeration of all cgroup subsystems */ -#define SUBSYS(_x) _x ## _subsys_id, +#define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include CGROUP_SUBSYS_COUNT, @@ -573,7 +573,6 @@ struct cgroup_subsys { struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int subsys_id; int disabled; int early_init; @@ -592,6 +591,8 @@ struct cgroup_subsys { bool broken_hierarchy; bool warned_broken_hierarchy; + /* the following two fields are initialized automtically during boot */ + int subsys_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -606,7 +607,7 @@ struct cgroup_subsys { struct cftype_set base_cftset; }; -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include #undef SUBSYS diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 11c42f6a25a8..768fe44e19f0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -12,7 +12,7 @@ SUBSYS(debug) #endif #if IS_ENABLED(CONFIG_CGROUP_SCHED) -SUBSYS(cpu_cgroup) +SUBSYS(cpu) #endif #if IS_ENABLED(CONFIG_CGROUP_CPUACCT) @@ -20,7 +20,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_MEMCG) -SUBSYS(mem_cgroup) +SUBSYS(memory) #endif #if IS_ENABLED(CONFIG_CGROUP_DEVICE) @@ -40,7 +40,7 @@ SUBSYS(blkio) #endif #if IS_ENABLED(CONFIG_CGROUP_PERF) -SUBSYS(perf) +SUBSYS(perf_event) #endif #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 787bba3bf552..0129f89cf98d 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_disabled(void) { - if (hugetlb_subsys.disabled) + if (hugetlb_cgrp_subsys.disabled) return true; return false; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abd0113b6620..eccfb4a4b379 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -162,7 +162,7 @@ extern int do_swap_account; static inline bool mem_cgroup_disabled(void) { - if (mem_cgroup_subsys.disabled) + if (memory_cgrp_subsys.disabled) return true; return false; } diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 9cf2d5ef38d9..c15d39456e14 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_css(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_cgrp_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index b7ff5bd3c3c3..f2a9597ff53c 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -33,7 +33,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx; rcu_read_lock(); - css = task_css(p, net_prio_subsys_id); + css = task_css(p, net_prio_cgrp_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ccb16b47e293..fe3f7253aa90 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -120,10 +120,18 @@ static struct workqueue_struct *cgroup_destroy_wq; static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* generate an array of cgroup subsystem pointers */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, static struct cgroup_subsys *cgroup_subsys[] = { #include }; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include +}; +#undef SUBSYS /* * The dummy hierarchy, reserved for the subsystems that are otherwise @@ -1076,7 +1084,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -4528,15 +4536,15 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->subsys_id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->subsys_id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss); @@ -5167,11 +5175,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6c3154e477f6..98ea26a99076 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) @@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -473,13 +471,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f1..2d018c795fea 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) @@ -1521,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); + cpuset_cgrp_id); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); @@ -2024,8 +2024,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, @@ -2033,7 +2032,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2699,7 +2697,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_cgrp_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..64903731d834 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -342,7 +342,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -595,7 +595,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = css_from_dir(f.file->f_dentry, &perf_subsys); + css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -8055,9 +8055,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..d4cfc5561830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7176,7 +7176,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7957,8 +7957,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7966,7 +7965,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f905..c143ee380e3a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829bb466..b135853e68f3 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[4]; memset(cft, 0, sizeof(*cft)); - WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); + WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files)); return; } @@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } -struct cgroup_subsys hugetlb_subsys = { - .name = "hugetlb", +struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, - .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f0..04a97bce2270 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,8 +66,8 @@ #include -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &mem_cgroup_subsys); + css = css_from_id(id - 1, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) @@ -1702,7 +1702,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_lock(); mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + task_cgrp = task_cgroup(p, memory_cgrp_id); ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); if (ret < 0) { @@ -6187,7 +6187,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, ret = -EINVAL; cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &mem_cgroup_subsys); + &memory_cgrp_subsys); if (cfile_css == css && css_tryget(css)) ret = 0; @@ -6566,11 +6566,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * unfortunate state in our controller. */ if (parent != root_mem_cgroup) - mem_cgroup_subsys.broken_hierarchy = true; + memory_cgrp_subsys.broken_hierarchy = true; } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &mem_cgroup_subsys); + return memcg_init_kmem(memcg, &memory_cgrp_subsys); } /* @@ -7264,9 +7264,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, +struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, @@ -7292,7 +7290,7 @@ __setup("swapaccount=", enable_swap_account); static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } static void __init enable_swap_cgroup(void) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 9e5ad5d74e60..b865662fba71 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return css_cls_state(task_css(p, net_cls_subsys_id)); + return css_cls_state(task_css(p, net_cls_cgrp_id)); } EXPORT_SYMBOL_GPL(task_cls_state); @@ -102,12 +102,10 @@ static struct cftype ss_files[] = { { } /* terminate */ }; -struct cgroup_subsys net_cls_subsys = { - .name = "net_cls", +struct cgroup_subsys net_cls_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, - .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, }; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 857e1603f9b7..d7d23e28fafd 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -244,13 +244,11 @@ static struct cftype ss_files[] = { { } /* terminate */ }; -struct cgroup_subsys net_prio_subsys = { - .name = "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558ba..20a0aca9131e 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -219,7 +219,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index d3b6d2cd3a06..7f88bcde7c61 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { - return css_to_devcgroup(task_css(task, devices_subsys_id)); + return css_to_devcgroup(task_css(task, devices_cgrp_id)); } -struct cgroup_subsys devices_subsys; - /* * called under devcgroup_mutex */ @@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = { { } /* terminate */ }; -struct cgroup_subsys devices_subsys = { - .name = "devices", +struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, - .subsys_id = devices_subsys_id, .base_cftypes = dev_cgroup_files, }; -- cgit v1.2.3 From aec25020f5d4b69aea5317551d1cb7043f6b04fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: rename cgroup_subsys->subsys_id to ->id It's no longer referenced outside cgroup core, so renaming is easy. Let's rename it for consistency & brevity. This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cd6611e622fd..198c7fcd727e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -548,7 +548,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); (task) = cgroup_taskset_next((tset))) \ if (!(skip_css) || \ cgroup_taskset_cur_css((tset), \ - (skip_css)->ss->subsys_id) != (skip_css)) + (skip_css)->ss->id) != (skip_css)) /* * Control Group subsystem type. @@ -592,7 +592,7 @@ struct cgroup_subsys { bool warned_broken_hierarchy; /* the following two fields are initialized automtically during boot */ - int subsys_id; + int id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe3f7253aa90..5a77ca0784a6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -198,7 +198,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -3982,7 +3982,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); + rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4014,7 +4014,7 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4034,7 +4034,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); } /** @@ -4065,7 +4065,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free; @@ -4292,7 +4292,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4496,7 +4496,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = css; + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4536,14 +4536,14 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, - ss->subsys_id, ss->name); + ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); - ss->subsys_id = i; + ss->id = i; ss->name = cgroup_subsys_name[i]; if (ss->early_init) -- cgit v1.2.3 From 5a17f543ed6808e9085063277fe46795dea484bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: improve css_from_dir() into css_tryget_from_dir() css_from_dir() returns the matching css (cgroup_subsys_state) given a dentry and subsystem. The function doesn't pin the css before returning and requires the caller to be holding RCU read lock or cgroup_mutex and handling pinning on the caller side. Given that users of the function are likely to want to pin the returned css (both existing users do) and that getting and putting css's are very cheap, there's no reason for the interface to be tricky like this. Rename css_from_dir() to css_tryget_from_dir() and make it try to pin the found css and return it only if pinning succeeded. The callers are updated so that they no longer do RCU locking and pinning around the function and just use the returned css. This will also ease converting cgroup to kernfs. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 25 ++++++++++++++++--------- kernel/events/core.c | 17 +---------------- mm/memcontrol.c | 16 +++++++--------- 4 files changed, 26 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c86ba7ff7a7e..1ba4fc08f776 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -825,8 +825,8 @@ int css_scan_tasks(struct cgroup_subsys_state *css, int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2de8decfd99f..fc2db071d95e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4978,28 +4978,35 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under cgroup_mutex or RCU read lock. The caller is - * responsible for pinning the returned css if it needs to be accessed - * outside the critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - - cgroup_assert_mutex_or_rcu_locked(); + struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? */ if (!dentry->d_inode || dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); + rcu_read_lock(); + cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 64903731d834..a3c3ab50271a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -370,11 +370,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -593,9 +588,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -604,13 +597,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -621,7 +607,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04a97bce2270..102ab48ffa13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6183,17 +6183,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - rcu_read_lock(); - + cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &memory_cgrp_subsys); - if (cfile_css == css && css_tryget(css)) - ret = 0; - - rcu_read_unlock(); - if (ret) + if (IS_ERR(cfile_css)) goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } ret = event->register_event(memcg, event->eventfd, buffer); if (ret) -- cgit v1.2.3 From de00ffa56ea3132c6013fc8f07133b8a1014cf53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes() Currently, cgroup_subsys->base_cftypes registration is different from dynamic cftypes registartion. Instead of going through cgroup_add_cftypes(), cgroup_init_subsys() invokes cgroup_init_cftsets() which makes use of cgroup_subsys->base_cftset which doesn't involve dynamic allocation. While avoiding dynamic allocation is somewhat nice, having two separate paths for cftypes registration is nasty, especially as we're planning to add more operations during cftypes registration. This patch drops cgroup_init_cftsets() and cgroup_subsys->base_cftset and registers base_cftypes using cgroup_add_cftypes(). This is done as a separate step in cgroup_init() instead of a part of cgroup_init_subsys(). This is because cgroup_init_subsys() can be called very early during boot when kmalloc() isn't available yet. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 +-- kernel/cgroup.c | 29 ++++++++--------------------- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1ba4fc08f776..c4350a82320b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -604,9 +604,8 @@ struct cgroup_subsys { /* list of cftype_sets */ struct list_head cftsets; - /* base cftypes, automatically [de]registered with subsys itself */ + /* base cftypes, automatically registered with subsys itself */ struct cftype *base_cftypes; - struct cftype_set base_cftset; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f204429d108..eb002c622cd6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4503,25 +4503,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); - - /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. - */ - if (ss->base_cftypes) { - struct cftype *cft; - - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; - - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } -} - static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4531,8 +4512,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + INIT_LIST_HEAD(&ss->cftsets); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; @@ -4621,6 +4601,13 @@ int __init cgroup_init(void) for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); } /* allocate id for the dummy hierarchy */ -- cgit v1.2.3 From 5f46990787e2721b4db190ddc8af6fdbe8f010d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: update the meaning of cftype->max_write_len cftype->max_write_len is used to extend the maximum size of writes. It's interpreted in such a way that the actual maximum size is one less than the specified value. The default size is defined by CGROUP_LOCAL_BUFFER_SIZE. Its interpretation is quite confusing - its value is decremented by 1 and then compared for equality with max size, which means that the actual default size is CGROUP_LOCAL_BUFFER_SIZE - 2, which is 62 chars. There's no point in having a limit that low. Update its definition so that it means the actual string length sans termination and anything below PAGE_SIZE-1 is treated as PAGE_SIZE-1. .max_write_len for "release_agent" is updated to PATH_MAX-1 and cgroup_release_agent_write() is updated so that the redundant strlen() check is removed and it uses strlcpy() instead of strcpy(). .max_write_len initializations in blk-throttle.c and cfq-iosched.c are no longer necessary and removed. The one in cpuset is kept unchanged as it's an approximated value to begin with. This will also make transition to kernfs smoother. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-throttle.c | 4 ---- block/cfq-iosched.c | 3 --- include/linux/cgroup.h | 5 +++-- kernel/cgroup.c | 18 ++++++++---------- 4 files changed, 11 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1474c3ab7e72..861c363e4129 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = { .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.io_service_bytes", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 744833b630c6..461187943392 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "leaf_weight", diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c4350a82320b..63feaed83bc6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -400,8 +400,9 @@ struct cftype { umode_t mode; /* - * If non-zero, defines the maximum length of string that can - * be passed to write_string; defaults to 64 + * The maximum length of string, excluding trailing nul, that can + * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is + * assumed. */ size_t max_write_len; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eb002c622cd6..fde3633ef389 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2213,13 +2213,14 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroupfs_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; spin_lock(&release_agent_path_lock); - strcpy(css->cgroup->root->release_agent_path, buffer); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; @@ -2245,20 +2246,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *ppos) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); char *buf; int ret; - if (nbytes >= max_bytes) + if (nbytes > max_bytes) return -E2BIG; buf = kmalloc(nbytes + 1, GFP_KERNEL); @@ -3919,7 +3917,7 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; -- cgit v1.2.3 From b1664924062393bb048203bd4622e0b1c9e1d328 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: introduce cgroup_ino() mm/memory-failure.c::hwpoison_filter_task() has been reaching into cgroup to extract the associated ino to be used as a filtering criterion. This is an implementation detail which shouldn't be depended upon from outside cgroup proper and is about to change with the scheduled kernfs conversion. This patch introduces a proper interface to determine the associated ino, cgroup_ino(), and updates hwpoison_filter_task() to use it instead of reaching directly into cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Andi Kleen Cc: Wu Fengguang --- include/linux/cgroup.h | 9 +++++++++ kernel/cgroup.c | 5 ++++- mm/memory-failure.c | 8 ++------ 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 63feaed83bc6..06f577764414 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -507,6 +507,15 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) return rcu_dereference(cgrp->name)->name; } +/* returns ino associated with a cgroup, 0 indicates unmounted root */ +static inline ino_t cgroup_ino(struct cgroup *cgrp) +{ + if (cgrp->dentry) + return cgrp->dentry->d_inode->i_ino; + else + return 0; +} + static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { struct cgroup_open_file *of = seq->private; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 42e588ef62d1..11f7a05e791e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -792,7 +792,10 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + do { + /* ino 0 is reserved for dummy_root */ + inode->i_ino = get_next_ino(); + } while (!inode->i_ino); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f08a2d61487..9b5933c66c16 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; -- cgit v1.2.3 From 59f5296b51b86718dd6eecf0a268b2f1a1ec0a2d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: misc preps for kernfs conversion * Un-inline seq_css(). After kernfs conversion, the function will need to dereference internal data structures. * Add cgroup_get/put_root() and replace direct super_block->s_active manipulatinos with them. These will be converted to kernfs_root refcnting. * Add cgroup_get/put() and replace dget/put() on cgrp->dentry with them. These will be converted to kernfs refcnting. * Update current_css_set_cg_links_read() to use cgroup_name() instead of reaching into the dentry name. The end result is the same. These changes don't make functional differences but will make transition to kernfs easier. v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++--- kernel/cgroup.c | 85 +++++++++++++++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 06f577764414..523277871913 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -516,18 +516,14 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp) return 0; } -static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) -{ - struct cgroup_open_file *of = seq->private; - return of->cfe->css; -} - static inline struct cftype *seq_cft(struct seq_file *seq) { struct cgroup_open_file *of = seq->private; return of->cfe->type; } +struct cgroup_subsys_state *seq_css(struct seq_file *seq); + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11f7a05e791e..9e9e8fd632d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -169,6 +169,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -204,6 +205,13 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -682,6 +690,16 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static void cgroup_get_root(struct cgroupfs_root *root) +{ + atomic_inc(&root->sb->s_active); +} + +static void cgroup_put_root(struct cgroupfs_root *root) +{ + deactivate_super(root->sb); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. @@ -837,18 +855,14 @@ static void cgroup_free_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. + * We get a ref to the parent, and put the ref when this cgroup is + * being freed, so it's guaranteed that the parent won't be + * destroyed before its children. */ - dput(cgrp->parent->dentry); + cgroup_put(cgrp->parent); - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); + /* put the root reference that we took when we created the cgroup */ + cgroup_put_root(cgrp->root); cgroup_pidlist_destroy_all(cgrp); @@ -866,6 +880,11 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } +static void cgroup_get(struct cgroup *cgrp) +{ + dget(cgrp->dentry); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -899,6 +918,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static void cgroup_put(struct cgroup *cgrp) +{ + dput(cgrp->dentry); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -2724,7 +2748,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; + struct cgroup *prev = NULL; struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; @@ -2754,9 +2778,10 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) continue; inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - dput(prev); - prev = cgrp->dentry; + cgroup_get(cgrp); + if (prev) + cgroup_put(prev); + prev = cgrp; mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); @@ -2768,8 +2793,8 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) break; } mutex_unlock(&cgroup_tree_mutex); - dput(prev); - deactivate_super(sb); + cgroup_put(prev); + cgroup_put_root(ss->root); return ret; } @@ -3863,11 +3888,9 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, */ static void cgroup_dput(struct cgroup *cgrp) { - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); + cgroup_get_root(cgrp->root); + cgroup_put(cgrp); + cgroup_put_root(cgrp->root); } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, @@ -4118,7 +4141,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free; - dget(cgrp->dentry); + cgroup_get(cgrp); css_get(css->parent); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && @@ -4197,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * can be done outside cgroup_mutex, since the sb can't * disappear while someone has an open control file on the * fs */ - atomic_inc(&sb->s_active); + cgroup_get_root(root); init_cgroup_housekeeping(cgrp); @@ -4231,7 +4254,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, root->number_of_cgroups++; /* hold a ref to the parent's dentry */ - dget(parent->dentry); + cgroup_get(parent); /* * @cgrp is now fully operational. If something fails after this @@ -4261,7 +4284,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); /* Release the reference count that we took on the superblock */ - deactivate_super(sb); + cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4493,7 +4516,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4501,7 +4523,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - dput(d); + cgroup_put(cgrp); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); @@ -5161,12 +5183,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; + const char *name = "?"; + + if (c != cgroup_dummy_top) + name = cgroup_name(c); - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } -- cgit v1.2.3 From 2bd59d48ebfb3df41ee56938946ca0dd30887312 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: convert to kernfs cgroup filesystem code was derived from the original sysfs implementation which was heavily intertwined with vfs objects and locking with the goal of re-using the existing vfs infrastructure. That experiment turned out rather disastrous and sysfs switched, a long time ago, to distributed filesystem model where a separate representation is maintained which is queried by vfs. Unfortunately, cgroup stuck with the failed experiment all these years and accumulated even more problems over time. Locking and object lifetime management being entangled with vfs is probably the most egregious. vfs is never designed to be misused like this and cgroup ends up jumping through various convoluted dancing to make things work. Even then, operations across multiple cgroups can't be done safely as it'll deadlock with rename locking. Recently, kernfs is separated out from sysfs so that it can be used by users other than sysfs. This patch converts cgroup to use kernfs, which will bring the following benefits. * Separation from vfs internals. Locking and object lifetime management is contained in cgroup proper making things a lot simpler. This removes significant amount of locking convolutions, hairy object lifetime rules and the restriction on multi-cgroup operations. * Can drop a lot of code to implement filesystem interface as most are provided by kernfs. * Proper "severing" semantics, which allows controllers to not worry about lingering file accesses after offline. While the preceding patches did as much as possible to make the transition less painful, large part of the conversion has to be one discrete step making this patch rather large. The rest of the commit message lists notable changes in different areas. Overall ------- * vfs constructs replaced with kernfs ones. cgroup->dentry w/ ->kn, cgroupfs_root->sb w/ ->kf_root. * All dentry accessors are removed. Helpers to map from kernfs constructs are added. * All vfs plumbing around dentry, inode and bdi removed. * cgroup_mount() now directly looks for matching root and then proceeds to create a new one if not found. Synchronization and object lifetime ----------------------------------- * vfs inode locking removed. Among other things, this removes the need for the convolution in cgroup_cfts_commit(). Future patches will further simplify it. * vfs refcnting replaced with cgroup internal ones. cgroup->refcnt, cgroupfs_root->refcnt added. cgroup_put_root() now directly puts root->refcnt and when it reaches zero proceeds to destroy it thus merging cgroup_put_root() and the former cgroup_kill_sb(). Simliarly, cgroup_put() now directly schedules cgroup_free_rcu() when refcnt reaches zero. * Unlike before, kernfs objects don't hold onto cgroup objects. When cgroup destroys a kernfs node, all existing operations are drained and the association is broken immediately. The same for cgroupfs_roots and mounts. * All operations which come through kernfs guarantee that the associated cgroup is and stays valid for the duration of operation; however, there are two paths which need to find out the associated cgroup from dentry without going through kernfs - css_tryget_from_dir() and cgroupstats_build(). For these two, kernfs_node->priv is RCU managed so that they can dereference it under RCU read lock. File and directory handling --------------------------- * File and directory operations converted to kernfs_ops and kernfs_syscall_ops. * xattrs is implicitly supported by kernfs. No need to worry about it from cgroup. This means that "xattr" mount option is no longer necessary. A future patch will add a deprecated warning message when sane_behavior. * When cftype->max_write_len > PAGE_SIZE, it's necessary to make a private copy of one of the kernfs_ops to set its atomic_write_len. cftype->kf_ops is added and cgroup_init/exit_cftypes() are updated to handle it. * cftype->lockdep_key added so that kernfs lockdep annotation can be per cftype. * Inidividual file entries and open states are now managed by kernfs. No need to worry about them from cgroup. cfent, cgroup_open_file and their friends are removed. * kernfs_nodes are created deactivated and kernfs_activate() invocations added to places where creation of new nodes are committed. * cgroup_rmdir() uses kernfs_[un]break_active_protection() for self-removal. v2: - Li pointed out in an earlier patch that specifying "name=" during mount without subsystem specification should succeed if there's an existing hierarchy with a matching name although it should fail with -EINVAL if a new hierarchy should be created. Prior to the conversion, this used by handled by deferring failure from NULL return from cgroup_root_from_opts(), which was necessary because root was being created before checking for existing ones. Note that cgroup_root_from_opts() returned an ERR_PTR() value for error conditions which require immediate mount failure. As we now have separate search and creation steps, deferring failure from cgroup_root_from_opts() is no longer necessary. cgroup_root_from_opts() is updated to always return ERR_PTR() value on failure. - The logic to match existing roots is updated so that a mount attempt with a matching name but different subsys_mask are rejected. This was handled by a separate matching loop under the comment "Check for name clashes with existing mounts" but got lost during conversion. Merge the check into the main search loop. - Add __rcu __force casting in RCU_INIT_POINTER() in cgroup_destroy_locked() to avoid the sparse address space warning reported by kbuild test bot. Maybe we want an explicit interface to use kn->priv as RCU protected pointer? v3: Make CONFIG_CGROUPS select CONFIG_KERNFS. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: kbuild test robot fengguang.wu@intel.com> --- include/linux/cgroup.h | 52 +-- init/Kconfig | 1 + kernel/cgroup.c | 1115 ++++++++++++++++-------------------------------- 3 files changed, 383 insertions(+), 785 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 523277871913..0e45a932b823 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -18,10 +18,10 @@ #include #include #include -#include #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -159,16 +159,17 @@ struct cgroup { /* the number of attached css's */ int nr_css; + atomic_t refcnt; + /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. */ struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct list_head files; /* my files */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct kernfs_node *kn; /* cgroup kernfs entry */ /* * Monotonically increasing unique serial number which defines a @@ -222,9 +223,6 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - - /* directory xattrs */ - struct simple_xattrs xattrs; }; #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -291,15 +289,17 @@ enum { /* * A cgroupfs_root represents the root of a cgroup hierarchy, and may be - * associated with a superblock to form an active hierarchy. This is + * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroupfs_root { - struct super_block *sb; + struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; + atomic_t refcnt; + /* Unique id for this hierarchy. */ int hierarchy_id; @@ -415,6 +415,9 @@ struct cftype { */ struct cgroup_subsys *ss; + /* kernfs_ops to use, initialized automatically during registration */ + struct kernfs_ops *kf_ops; + /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -460,6 +463,10 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif }; /* @@ -472,26 +479,6 @@ struct cftype_set { struct cftype *cfts; }; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't - * access directly. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* seq_file->private points to the following, only ->priv is public */ -struct cgroup_open_file { - struct cfent *cfe; - void *priv; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -510,16 +497,17 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - if (cgrp->dentry) - return cgrp->dentry->d_inode->i_ino; + if (cgrp->kn) + return cgrp->kn->ino; else return 0; } static inline struct cftype *seq_cft(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->type; + struct kernfs_open_file *of = seq->private; + + return of->kn->priv; } struct cgroup_subsys_state *seq_css(struct seq_file *seq); diff --git a/init/Kconfig b/init/Kconfig index 009a797dd242..3f74784560a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -854,6 +854,7 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" + select KERNFS help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d8efca44de5f..cda614da40cf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,9 +40,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -50,7 +48,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -176,7 +173,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** @@ -209,8 +205,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) struct cgroup_subsys_state *seq_css(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->css; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. + */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; } EXPORT_SYMBOL_GPL(seq_css); @@ -276,21 +286,6 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -692,6 +687,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *top_cgrp = kf_root->kn->priv; + + return top_cgrp->root; +} + static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) { int id; @@ -730,30 +732,37 @@ static void cgroup_free_root(struct cgroupfs_root *root) static void cgroup_get_root(struct cgroupfs_root *root) { - atomic_inc(&root->sb->s_active); + /* + * The caller must ensure that @root is alive, which can be + * achieved by holding a ref on one of the member cgroups or + * following a registered reference to @root while holding + * cgroup_tree_mutex. + */ + WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); + atomic_inc(&root->refcnt); } static void cgroup_put_root(struct cgroupfs_root *root) { - deactivate_super(root->sb); -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - BUG_ON(!root); + /* + * @root's refcnt reaching zero and its deregistration should be + * atomic w.r.t. cgroup_tree_mutex. This ensures that + * cgroup_get_root() is safe to invoke if @root is registered. + */ + mutex_lock(&cgroup_tree_mutex); + if (!atomic_dec_and_test(&root->refcnt)) { + mutex_unlock(&cgroup_tree_mutex); + return; + } + mutex_lock(&cgroup_mutex); BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - /* Rebind all subsystems back to the default hierarchy */ if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { ret = rebind_subsystems(root, 0, root->subsys_mask); @@ -783,11 +792,8 @@ static void cgroup_kill_sb(struct super_block *sb) mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - simple_xattrs_free(&cgrp->xattrs); - kill_litter_super(sb); + kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -878,42 +884,10 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. - */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - - if (inode) { - do { - /* ino 0 is reserved for dummy_root */ - inode->i_ino = get_next_ino(); - } while (!inode->i_ino); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - static struct cgroup_name *cgroup_alloc_name(const char *name_str) { struct cgroup_name *name; @@ -983,8 +957,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); - simple_xattrs_free(&cgrp->xattrs); - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -999,81 +971,38 @@ static void cgroup_free_rcu(struct rcu_head *head) static void cgroup_get(struct cgroup *cgrp) { - dget(cgrp->dentry); -} - -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } static void cgroup_put(struct cgroup *cgrp) { - dput(cgrp->dentry); -} + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + return; -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); + /* + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. + */ + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { - struct cfent *cfe; + char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); - - /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. - */ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; - - if (cft && cfe->type != cft) - continue; - - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); - - break; - } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -1096,22 +1025,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { @@ -1179,13 +1092,15 @@ static int rebind_subsystems(struct cgroupfs_root *root, * now matches the bound subsystems. */ root->flags |= CGRP_ROOT_SUBSYS_BOUND; + kernfs_activate(cgrp->kn); return 0; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; @@ -1219,9 +1134,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1380,11 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1393,7 +1304,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1439,34 +1349,26 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; + atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1475,32 +1377,12 @@ static void init_cgroup_root(struct cgroupfs_root *root) idr_init(&root->cgroup_idr); } -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; if (!opts->subsys_mask && !opts->none) - return NULL; + return ERR_PTR(-EINVAL); root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -1527,99 +1409,21 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; - - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; - - BUG_ON(!opts->subsys_mask && !opts->none); - - ret = set_anon_super(sb, NULL); - if (ret) - return ret; - - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; - - return 0; -} - -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; - - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - - if (!inode) - return -ENOMEM; - - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; -} - static int cgroup_setup_root(struct cgroupfs_root *root) { LIST_HEAD(tmp_links); - struct super_block *sb = root->sb; struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; struct css_set *cset; - struct inode *inode; - const struct cred *cred; int i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - BUG_ON(sb->s_root != NULL); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - ret = cgroup_get_rootdir(sb); - if (ret) { - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - return ret; - } - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); if (ret < 0) - goto out_unlock; + goto out; root_cgrp->id = ret; - /* check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto out_unlock; - /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding @@ -1628,34 +1432,29 @@ static int cgroup_setup_root(struct cgroupfs_root *root) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out_unlock; + goto out; /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ ret = cgroup_init_root_id(root, 2, 0); if (ret) - goto out_unlock; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; + goto out; - /* - * We're inside get_sb() and will call lookup_one_len() to create - * the root files, which doesn't work if SELinux is in use. The - * following cred dancing somehow works around it. See 2ce9738ba - * ("cgroupfs: use init_cred when populating new cgroupfs mount") - * for more details. - */ - cred = override_creds(&init_cred); + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) - goto rm_base_files; + goto destroy_root; ret = rebind_subsystems(root, root->subsys_mask, 0); if (ret) - goto rm_base_files; - - revert_creds(cred); + goto destroy_root; /* * There must be no failure case after here, since rebinding takes @@ -1677,15 +1476,16 @@ static int cgroup_setup_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + kernfs_activate(root_cgrp->kn); ret = 0; - goto out_unlock; + goto out; -rm_base_files: - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: cgroup_exit_root_id(root); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: free_cgrp_cset_links(&tmp_links); return ret; } @@ -1694,10 +1494,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct super_block *sb = NULL; - struct cgroupfs_root *root = NULL; + struct cgroupfs_root *root; struct cgroup_sb_opts opts; - struct cgroupfs_root *new_root; + struct dentry *dentry; int ret; mutex_lock(&cgroup_tree_mutex); @@ -1708,41 +1507,32 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. - */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_unlock; - } - opts.new_root = new_root; + /* look for a matching existing root */ + for_each_active_root(root) { + bool name_match = false; - /* Locate an existing or new sb for this hierarchy */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_unlock; - } + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - ret = cgroup_setup_root(root); - if (ret) - goto out_unlock; - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. */ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1753,23 +1543,45 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } + + cgroup_get_root(root); + goto out_unlock; } - ret = 0; + /* no such thing, create a new one */ + root = cgroup_root_from_opts(&opts); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_unlock; + } + + ret = cgroup_setup_root(root); + if (ret) + cgroup_free_root(root); + out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - if (ret && !IS_ERR_OR_NULL(sb)) - deactivate_locked_super(sb); - kfree(opts.release_agent); kfree(opts.name); - if (!ret) - return dget(sb->s_root); - else + if (ret) return ERR_PTR(ret); + + dentry = kernfs_mount(fs_type, flags, root->kf_root); + if (IS_ERR(dentry)) + cgroup_put_root(root); + return dentry; +} + +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + + cgroup_put_root(root); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -2301,29 +2113,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); - char *buf; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; int ret; - if (nbytes > max_bytes) - return -E2BIG; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } - - buf[nbytes] = '\0'; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); if (cft->write_string) { ret = cft->write_string(css, cft, strstrip(buf)); @@ -2342,53 +2148,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, } else { ret = -EINVAL; } -out_free: - kfree(buf); + return ret ?: nbytes; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_start) { - return cft->seq_start(seq, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; - } + return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_next) { - return cft->seq_next(seq, v, ppos); - } else { - /* - * The same behavior and code as single_open(), always - * terminate after the initial read. - */ - ++*ppos; - return NULL; - } + return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_stop) - cft->seq_stop(seq, v); + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -2408,96 +2184,36 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return 0; } -static struct seq_operations cgroup_seq_operations = { - .start = cgroup_seqfile_start, - .next = cgroup_seqfile_next, - .stop = cgroup_seqfile_stop, - .show = cgroup_seqfile_show, +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, }; -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - struct cgroup_open_file *of; - int err; - - err = generic_file_open(inode, file); - if (err) - return err; - - /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. - */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); - - if (!css) - return -ENODEV; - - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. - */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; - - of = __seq_open_private(file, &cgroup_seq_operations, - sizeof(struct cgroup_open_file)); - if (of) { - of->cfe = cfe; - return 0; - } - - if (css->ss) - css_put(css); - return -ENOMEM; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (css->ss) - css_put(css); - return seq_release_private(inode, file); -} +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; /* * cgroup_rename - Only allow simple rename of directories in place. */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { - int ret; + struct cgroup *cgrp = kn->priv; struct cgroup_name *name, *old_name; - struct cgroup *cgrp; - - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. - */ - lockdep_assert_held(&old_dir->i_mutex); + int ret; - if (!S_ISDIR(old_dentry->d_inode->i_mode)) + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) + if (kn->parent != new_parent) return -EIO; - cgrp = __d_cgrp(old_dentry); - /* * This isn't a proper migration and its usefulness is very * limited. Disallow if sane_behavior. @@ -2505,186 +2221,43 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry->d_name.name); + name = cgroup_alloc_name(new_name_str); if (!name) return -ENOMEM; - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) { + old_name = rcu_dereference_protected(cgrp->name, true); + rcu_assign_pointer(cgrp->name, name); + } else { + old_name = name; } - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); kfree_rcu(old_name, rcu_head); - return 0; -} - -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; -} - -static inline int xattr_enabled(struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; -} - -static bool is_valid_xattr(const char *name) -{ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; -} - -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); -} - -static int cgroup_removexattr(struct dentry *dentry, const char *name) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); -} - -static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, - void *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); -} - -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} - -static const struct file_operations cgroup_file_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); - - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. - */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; + return ret; } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; - - cgroup_file_name(cgrp, cft, name); - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; - } - - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); - - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); + return 0; } /** @@ -2704,7 +2277,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2749,9 +2321,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - struct super_block *sb = ss->root->sb; struct cgroup *prev = NULL; - struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2759,12 +2329,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { + if (!cfts || ss->root == &cgroup_dummy_root) { mutex_unlock(&cgroup_tree_mutex); return 0; } + cgroup_get_root(ss->root); + /* * All cgroups which are created after we drop cgroup_mutex will * have the updated set of files, so we only need to update the @@ -2779,18 +2350,16 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; cgroup_get(cgrp); if (prev) cgroup_put(prev); prev = cgrp; - mutex_unlock(&cgroup_tree_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) { ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&inode->i_mutex); + if (is_add) + kernfs_activate(cgrp->kn); + } if (ret) break; } @@ -2804,16 +2373,45 @@ static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; cft->ss = NULL; + } } -static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } + + cft->kf_ops = kf_ops; cft->ss = ss; + } + + return 0; } /** @@ -2839,7 +2437,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (!set) return -ENOMEM; - cgroup_init_cftypes(ss, cfts); + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; cgroup_cfts_prepare(); set->cfts = cfts; @@ -3706,21 +3306,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp) { + rcu_read_unlock(); + return -ENOENT; + } css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3745,8 +3351,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + rcu_read_unlock(); + return 0; } @@ -3764,7 +3370,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -3819,7 +3425,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; if (l) @@ -3830,7 +3436,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; @@ -3880,21 +3486,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. - */ -static void cgroup_dput(struct cgroup *cgrp) -{ - cgroup_get_root(cgrp->root); - cgroup_put(cgrp); - cgroup_put_root(cgrp->root); -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4029,7 +3620,7 @@ static void css_free_work_fn(struct work_struct *work) css_put(css->parent); css->ss->css_free(css); - cgroup_dput(cgrp); + cgroup_put(cgrp); } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4037,10 +3628,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context which we don't have. - */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } @@ -4122,7 +3709,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int err; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(cgroup_css(parent, ss)); @@ -4163,30 +3749,28 @@ err_free: return err; } -/* +/** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * @name_str: name of the new cgroup + * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static long cgroup_create(struct cgroup *parent, const char *name_str, + umode_t mode) { struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry->d_name.name); + name = cgroup_alloc_name(name_str); if (!name) { err = -ENOMEM; goto err_free_cgrp; @@ -4217,18 +3801,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_unlock; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - cgroup_get_root(root); - init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; @@ -4239,15 +3813,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. - */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + if (IS_ERR(kn)) { + err = PTR_ERR(kn); goto err_free_id; - lockdep_assert_held(&dentry->d_inode->i_mutex); + } + cgrp->kn = kn; cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4255,7 +3827,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* hold a ref to the parent's dentry */ + /* + * Grab a reference on the root and parent so that they don't get + * deleted while there are child cgroups. + */ + cgroup_get_root(root); cgroup_get(parent); /* @@ -4277,16 +3853,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + kernfs_activate(kn); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); - /* Release the reference count that we took on the superblock */ - cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4300,16 +3875,15 @@ err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *c_parent = dentry->d_parent->d_fsdata; + struct cgroup *parent = parent_kn->priv; - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + return cgroup_create(parent, name, mode); } /* @@ -4373,6 +3947,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* @@ -4421,13 +3999,12 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; - struct cgroup_subsys_state *css; struct cgroup *child; + struct cgroup_subsys_state *css; + struct kernfs_node *kn; bool empty; int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4492,15 +4069,24 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); + /* remove @cgrp directory along with the base files */ + mutex_unlock(&cgroup_mutex); + /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_from_dir(). Those are supported by RCU protecting + * clearing of cgrp->kn->priv backpointer, which should happen + * after all files under it have been removed. */ - mutex_unlock(&cgroup_mutex); - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + kn = cgrp->kn; + kernfs_get(kn); + + kernfs_remove(cgrp->kn); + + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + kernfs_put(kn); + mutex_lock(&cgroup_mutex); return 0; @@ -4531,19 +4117,46 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) check_for_release(parent); } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_rmdir(struct kernfs_node *kn) { - int ret; + struct cgroup *cgrp = kn->priv; + int ret = 0; + + /* + * This is self-destruction but @kn can't be removed while this + * callback is in progress. Let's break active protection. Once + * the protection is broken, @cgrp can be destroyed at any point. + * Pin it so that it stays accessible. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); + + /* + * @cgrp might already have been destroyed while we're trying to + * grab the mutexes. + */ + if (!cgroup_is_dead(cgrp)) + ret = cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); return ret; } +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4635,11 +4248,7 @@ int __init cgroup_init(void) unsigned long key; int i, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - cgroup_init_cftypes(NULL, cgroup_base_files); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); for_each_subsys(ss, i) { if (!ss->early_init) @@ -4669,24 +4278,17 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; - } + if (!cgroup_kobj) + return -ENOMEM; err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -5095,18 +4697,25 @@ __setup("cgroup_disable=", cgroup_disable); struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; - struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); - cgrp = __d_cgrp(dentry); - css = cgroup_css(cgrp, ss); + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See destroy_locked() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); if (!css || !css_tryget(css)) css = ERR_PTR(-ENOENT); -- cgit v1.2.3 From 86bf4b68759141459864ebd36ac3038a9cda895b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: warn if "xattr" is specified with "sane_behavior" Mount option "xattr" is no longer necessary as it's enabled by default on kernfs. Warn if "xattr" is specified with "sane_behavior" so that the option can be removed in the future. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0e45a932b823..305e94ee17f5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -264,6 +264,8 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * + * - "xattr" mount option is deprecated. kernfs always enables it. + * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of * being moved to an ancestor. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cda614da40cf..a0fab71f200f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1267,6 +1267,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); return -EINVAL; } + + if (opts->flags & CGRP_ROOT_XATTR) + pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.2.3 From 0adb070426dde2fd0b84e7f4f5cefcd8f0b24410 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: remove cftype_set cftype_set was added primarily to allow registering the same cftype array more than once for different subsystems. Nobody uses or needs such thing and it's already broken because each cftype has ->ss pointer which is initialized during registration. Let's add list_head ->node to cftype and use the first cftype entry in the array to link them instead of allocating separate cftype_set. While at it, trigger WARN if cft seems previously initialized during registration. This simplifies cftype handling a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 26 +++++++++----------------- kernel/cgroup.c | 41 +++++++++++++---------------------------- 2 files changed, 22 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 305e94ee17f5..b42251a23129 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -412,12 +412,11 @@ struct cftype { unsigned int flags; /* - * The subsys this file belongs to. Initialized automatically - * during registration. NULL for cgroup core files. + * Fields used for internal bookkeeping. Initialized automatically + * during registration. */ - struct cgroup_subsys *ss; - - /* kernfs_ops to use, initialized automatically during registration */ + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; /* @@ -471,16 +470,6 @@ struct cftype { #endif }; -/* - * cftype_sets describe cftypes belonging to a subsystem and are chained at - * cgroup_subsys->cftsets. Each cftset points to an array of cftypes - * terminated by zero length name. - */ -struct cftype_set { - struct list_head node; /* chained at subsys->cftsets */ - struct cftype *cfts; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -597,8 +586,11 @@ struct cgroup_subsys { /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - /* list of cftype_sets */ - struct list_head cftsets; + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. + */ + struct list_head cfts; /* base cftypes, automatically registered with subsys itself */ struct cftype *base_cftypes; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a2cbd1549995..506ebd61d1c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1016,12 +1016,12 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } @@ -2392,6 +2392,8 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; + WARN_ON(cft->ss || cft->kf_ops); + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -2430,26 +2432,15 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype *found = NULL; - struct cftype_set *set; - if (!cfts || !cfts[0].ss) return -ENOENT; cgroup_cfts_prepare(); + list_del(&cfts->node); + cgroup_cfts_commit(cfts, false); - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - found = cfts; - break; - } - } - - cgroup_cfts_commit(found, false); cgroup_exit_cftypes(cfts); - return found ? 0 : -ENOENT; + return 0; } /** @@ -2468,20 +2459,14 @@ int cgroup_rm_cftypes(struct cftype *cfts) */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { - struct cftype_set *set; int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; - ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); + list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_cfts_commit(cfts, true); if (ret) cgroup_rm_cftypes(cfts); @@ -3574,13 +3559,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -4169,7 +4154,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&ss->cftsets); + INIT_LIST_HEAD(&ss->cfts); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; -- cgit v1.2.3 From e61734c55c24cdf11b07e52a74aec4dc4a7f4bd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroup->name cgroup->name handling became quite complicated over time involving dedicated struct cgroup_name for RCU protection. Now that cgroup is on kernfs, we can drop all of it and simply use kernfs_name/path() and friends. Replace cgroup->name and all related code with kernfs name/path constructs. * Reimplement cgroup_name() and cgroup_path() as thin wrappers on top of kernfs counterparts, which involves semantic changes. pr_cont_cgroup_name() and pr_cont_cgroup_path() added. * cgroup->name handling dropped from cgroup_rename(). * All users of cgroup_name/path() updated to the new semantics. Users which were formatting the string just to printk them are converted to use pr_cont_cgroup_name/path() instead, which simplifies things quite a bit. As cgroup_name() no longer requires RCU read lock around it, RCU lockings which were protecting only cgroup_name() are removed. v2: Comment above oom_info_lock updated as suggested by Michal. v3: dummy_top doesn't have a kn associated and pr_cont_cgroup_name/path() ended up calling the matching kernfs functions with NULL kn leading to oops. Test for NULL kn and print "/" if so. This issue was reported by Fengguang Wu. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Fengguang Wu Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- block/blk-cgroup.h | 12 ++-- fs/kernfs/dir.c | 1 + include/linux/cgroup.h | 63 ++++++++++++--------- kernel/cgroup.c | 146 +++++++++++-------------------------------------- kernel/cpuset.c | 27 +++++---- kernel/sched/debug.c | 3 +- mm/memcontrol.c | 68 ++++++----------------- 7 files changed, 110 insertions(+), 210 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 453b528c8e19..15a8d640de57 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { - int ret; + char *p; - ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (ret) + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { strncpy(buf, "", buflen); - return ret; + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; } /** diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a347792c2e5a..939684ebff1e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -112,6 +112,7 @@ char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) spin_unlock_irqrestore(&kernfs_rename_lock, flags); return p; } +EXPORT_SYMBOL_GPL(kernfs_path); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b42251a23129..4d6ff7d40cf6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,11 +138,6 @@ enum { CGRP_SANE_BEHAVIOR, }; -struct cgroup_name { - struct rcu_head rcu_head; - char name[]; -}; - struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -179,19 +174,6 @@ struct cgroup { */ u64 serial_nr; - /* - * This is a copy of dentry->d_name, and it's needed because - * we can't use dentry->d_name in cgroup_path(). - * - * You must acquire rcu_read_lock() to access cgrp->name, and - * the only place that can change it is rename(), which is - * protected by parent dir's i_mutex. - * - * Normally you should use cgroup_name() wrapper rather than - * access it directly. - */ - struct cgroup_name __rcu *name; - /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -479,12 +461,6 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; } -/* Caller should hold rcu_read_lock() */ -static inline const char *cgroup_name(const struct cgroup *cgrp) -{ - return rcu_dereference(cgrp->name)->name; -} - /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { @@ -503,14 +479,47 @@ static inline struct cftype *seq_cft(struct seq_file *seq) struct cgroup_subsys_state *seq_css(struct seq_file *seq); +/* + * Name / path handling functions. All are thin wrappers around the kernfs + * counterparts and can be called under any context. + */ + +static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_name(cgrp->kn, buf, buflen); +} + +static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, + size_t buflen) +{ + return kernfs_path(cgrp->kn, buf, buflen); +} + +static inline void pr_cont_cgroup_name(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_name(cgrp->kn); + else + pr_cont("/"); +} + +static inline void pr_cont_cgroup_path(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_path(cgrp->kn); + else + pr_cont("/"); +} + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 59dfb025f1ac..638df032fb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -145,8 +145,6 @@ static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -888,17 +886,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct cgroup_name *cgroup_alloc_name(const char *name_str) -{ - struct cgroup_name *name; - - name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, name_str); - return name; -} - static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { @@ -958,8 +945,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); kernfs_put(cgrp->kn); - - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -1377,7 +1362,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); } @@ -1597,57 +1581,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; -/** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. - */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -1659,16 +1592,14 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); @@ -1676,14 +1607,15 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2211,7 +2143,6 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; - struct cgroup_name *name, *old_name; int ret; if (kernfs_type(kn) != KERNFS_DIR) @@ -2226,25 +2157,13 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_name_str); - if (!name) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) { - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); - } else { - old_name = name; - } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - - kfree_rcu(old_name, rcu_head); return ret; } @@ -3719,14 +3638,13 @@ err_free: /** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @name_str: name of the new cgroup + * @name: name of the new cgroup * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, const char *name_str, +static long cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; @@ -3737,13 +3655,6 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(name_str); - if (!name) { - err = -ENOMEM; - goto err_free_cgrp; - } - rcu_assign_pointer(cgrp->name, name); - mutex_lock(&cgroup_tree_mutex); /* @@ -3781,7 +3692,7 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); /* create the directory */ - kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { err = PTR_ERR(kn); goto err_free_id; @@ -3839,8 +3750,6 @@ err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: kfree(cgrp); return err; @@ -4304,12 +4213,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; struct cgroupfs_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -4337,10 +4246,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } @@ -4588,16 +4499,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -4605,7 +4517,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -4755,6 +4667,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; read_lock(&css_set_lock); rcu_read_lock(); @@ -4763,14 +4680,17 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) - name = cgroup_name(c); + if (c != cgroup_dummy_top) { + cgroup_name(c, name_buf, NAME_MAX + 1); + name = name_buf; + } seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } rcu_read_unlock(); read_unlock(&css_set_lock); + kfree(name_buf); return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2d018c795fea..e97a6e88d036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2088,10 +2088,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2619,19 +2618,17 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); spin_unlock(&cpuset_buffer_lock); - rcu_read_unlock(); } /* @@ -2681,12 +2678,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2696,14 +2693,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..30eee3b5293d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 102ab48ffa13..c1c25494f7ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ + /* oom_info_lock ensures that parallel ooms do not interleave */ static DEFINE_SPINLOCK(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; struct mem_cgroup *iter; unsigned int i; @@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) spin_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, memory_cgrp_id); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - pr_info("Task in %s killed", memcg_name); - - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); -done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1745,13 +1716,8 @@ done: res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats"); - - rcu_read_lock(); - ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); - if (!ret) - pr_cont(" for %s", memcg_name); - rcu_read_unlock(); + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { struct kmem_cache *new = NULL; - static char *tmp_name = NULL; + static char *tmp_path = NULL, *tmp_name = NULL; static DEFINE_MUTEX(mutex); /* protects tmp_name */ BUG_ON(!memcg_can_account_kmem(memcg)); @@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, * This static temporary buffer is used to prevent from * pointless shortliving allocation. */ - if (!tmp_name) { - tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_path || !tmp_name) { + if (!tmp_path) + tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!tmp_name) + tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmp_path || !tmp_name) goto out; } - rcu_read_lock(); - snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); - rcu_read_unlock(); + cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); + snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), tmp_name); - new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, + new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; -- cgit v1.2.3 From 3c9c825b8b50de7dbb015e6bfc04bb2da79364d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: rename cgroupfs_root->number_of_cgroups to ->nr_cgrps and make it atomic_t root->number_of_cgroups is currently an integer protected with cgroup_mutex. Except for sanity checks and proc reporting, the only place it's used is to check whether the root has any child during remount; however, this is a bit flawed as the counter is not decremented when the cgroup is unlinked but when it's released, meaning that there could be an extended period where all cgroups are removed but remount is still not allowed because some internal objects are lingering. While not perfect either, it'd be better to use emptiness test on root->top_cgroup.children. This patch updates cgroup_remount() to test top_cgroup's children instead, which makes number_of_cgroups only actual usage statistics printing in proc implemented in proc_cgroupstats_show(). Let's shorten its name and make it an atomic_t so that we don't have to worry about its synchronization. It's purely auxiliary at this point. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4d6ff7d40cf6..f0e6105bbbc1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -290,8 +290,8 @@ struct cgroupfs_root { /* The root cgroup for this hierarchy */ struct cgroup top_cgroup; - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; /* A list running through the active hierarchies */ struct list_head root_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 638df032fb94..cffdb6e2ad08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -758,7 +758,7 @@ static void cgroup_put_root(struct cgroupfs_root *root) } mutex_lock(&cgroup_mutex); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -928,9 +928,7 @@ static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); + atomic_dec(&cgrp->root->nr_cgrps); /* * We get a ref to the parent, and put the ref when this cgroup is @@ -1320,7 +1318,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->top_cgroup.children)) { ret = -EBUSY; goto out_unlock; } @@ -1360,7 +1358,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); @@ -1463,7 +1461,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) write_unlock(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); ret = 0; @@ -3709,7 +3707,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; + atomic_inc(&root->nr_cgrps); /* * Grab a reference on the root and parent so that they don't get @@ -4281,7 +4279,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.2.3 From 776f02fa4e1ad70557c0318c70ce928e0642bee0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroupfs_root->refcnt Currently, cgroupfs_root and its ->top_cgroup are separated reference counted and the latter's is ignored. There's no reason to do this separately. This patch removes cgroupfs_root->refcnt and destroys cgroupfs_root when the top_cgroup is released. * cgroup_put() updated to ignore cgroup_is_dead() test for top cgroups. cgroup_free_fn() updated to handle root destruction when releasing a top cgroup. * As root destruction is now bounced through cgroup destruction, it is asynchronous. Update cgroup_mount() so that it waits for pending release which is currently implemented using msleep(). Converting this to proper wait_queue isn't hard but likely unnecessary. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 +-- kernel/cgroup.c | 86 ++++++++++++++++++++++---------------------------- 2 files changed, 39 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f0e6105bbbc1..298d616e8f40 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -282,12 +282,10 @@ struct cgroupfs_root { /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; - atomic_t refcnt; - /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup for this hierarchy */ + /* The root cgroup. Root is destroyed on its release. */ struct cgroup top_cgroup; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cffdb6e2ad08..03845c5d082b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -53,6 +53,7 @@ #include /* TODO: replace with more sophisticated array */ #include /* used in cgroup_attach_task */ #include +#include #include @@ -728,37 +729,16 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_get_root(struct cgroupfs_root *root) -{ - /* - * The caller must ensure that @root is alive, which can be - * achieved by holding a ref on one of the member cgroups or - * following a registered reference to @root while holding - * cgroup_tree_mutex. - */ - WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); - atomic_inc(&root->refcnt); -} - -static void cgroup_put_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - /* - * @root's refcnt reaching zero and its deregistration should be - * atomic w.r.t. cgroup_tree_mutex. This ensures that - * cgroup_get_root() is safe to invoke if @root is registered. - */ mutex_lock(&cgroup_tree_mutex); - if (!atomic_dec_and_test(&root->refcnt)) { - mutex_unlock(&cgroup_tree_mutex); - return; - } mutex_lock(&cgroup_mutex); - BUG_ON(atomic_read(&root->nr_cgrps) != 1); + BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -929,21 +909,24 @@ static void cgroup_free_fn(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); atomic_dec(&cgrp->root->nr_cgrps); - - /* - * We get a ref to the parent, and put the ref when this cgroup is - * being freed, so it's guaranteed that the parent won't be - * destroyed before its children. - */ - cgroup_put(cgrp->parent); - - /* put the root reference that we took when we created the cgroup */ - cgroup_put_root(cgrp->root); - cgroup_pidlist_destroy_all(cgrp); - kernfs_put(cgrp->kn); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is top cgroup's refcnt reaching zero, which + * indicates that the root should be released. + */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -965,7 +948,7 @@ static void cgroup_put(struct cgroup *cgrp) { if (!atomic_dec_and_test(&cgrp->refcnt)) return; - if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; /* @@ -1356,7 +1339,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; @@ -1485,7 +1467,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; - +retry: mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1531,7 +1513,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } } - cgroup_get_root(root); + /* + * A root's lifetime is governed by its top cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. + */ + if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + msleep(10); + goto retry; + } + + ret = 0; goto out_unlock; } @@ -1558,7 +1554,7 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); return dentry; } @@ -1567,7 +1563,7 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); kernfs_kill_sb(sb); } @@ -3708,12 +3704,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); atomic_inc(&root->nr_cgrps); - - /* - * Grab a reference on the root and parent so that they don't get - * deleted while there are child cgroups. - */ - cgroup_get_root(root); cgroup_get(parent); /* -- cgit v1.2.3 From d3ba07c3aa9ae3e03329b0a7f1a067c0647aa2af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: disallow xattr, release_agent and name if sane_behavior Disallow more mount options if sane_behavior. Note that xattr used to generate warning. While at it, simplify option check in cgroup_mount() and update sane_behavior comment in cgroup.h. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 14 ++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 298d616e8f40..5f669ca0ee36 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -227,8 +227,8 @@ enum { * * The followings are the behaviors currently affected this flag. * - * - Mount options "noprefix" and "clone_children" are disallowed. - * Also, cgroupfs file cgroup.clone_children is not created. + * - Mount options "noprefix", "xattr", "clone_children", + * "release_agent" and "name" are disallowed. * * - When mounting an existing superblock, mount options should * match. @@ -246,7 +246,7 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * - * - "xattr" mount option is deprecated. kernfs always enables it. + * - "cgroup.clone_children" is removed. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 03845c5d082b..079c478a4735 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1226,18 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } - - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); - return -EINVAL; - } - - if (opts->flags & CGRP_ROOT_XATTR) - pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.2.3 From 35585573055f37837eb752ee22eb5523682ca742 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: drop CGRP_ROOT_SUBSYS_BOUND Before kernfs conversion, due to the way super_block lookup works, cgroup roots were created and made visible before being fully initialized. This in turn required a special flag to mark that the root hasn't been fully initialized so that the destruction path can tell fully bound ones from half initialized. That flag is CGRP_ROOT_SUBSYS_BOUND and no longer necessary after the kernfs conversion as the lookup and creation of new root are atomic w.r.t. cgroup_mutex. This patch removes the flag and passes the requests subsystem mask to cgroup_setup_root() so that it can set the respective mask bits as subsystems are bound. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 -- kernel/cgroup.c | 28 ++++------------------------ 2 files changed, 4 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f669ca0ee36..e2ffcdc26cb7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -267,8 +267,6 @@ enum { /* mount options live below bit 16 */ CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, - - CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */ }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 079c478a4735..878cd1810ad1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -733,7 +733,6 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; - int ret; mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -742,11 +741,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - } + WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1055,13 +1050,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. - */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; kernfs_activate(cgrp->kn); - return 0; } @@ -1353,15 +1342,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. - */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1372,7 +1352,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_setup_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->top_cgroup; @@ -1415,7 +1395,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) if (ret) goto destroy_root; - ret = rebind_subsystems(root, root->subsys_mask, 0); + ret = rebind_subsystems(root, ss_mask, 0); if (ret) goto destroy_root; @@ -1532,7 +1512,7 @@ retry: goto out_unlock; } - ret = cgroup_setup_root(root); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); -- cgit v1.2.3 From 07bc356ed2950048d33d667e933e1b913c6e6b6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: implement cgroup_has_tasks() and unexport cgroup_task_count() cgroup_task_count() read-locks css_set_lock and walks all tasks to count them and then returns the result. The only thing all the users want is determining whether the cgroup is empty or not. This patch implements cgroup_has_tasks() which tests whether cgroup->cset_links is empty, replaces all cgroup_task_count() usages and unexports it. Note that the test isn't synchronized. This is the same as before. The test has always been racy. This will help planned css_set locking update. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- include/linux/cgroup.h | 8 ++++++-- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- mm/memcontrol.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e2ffcdc26cb7..72154fb44fb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -457,6 +457,12 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; } +/* no synchronization, the result can only be used as a hint */ +static inline bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return !list_empty(&cgrp->cset_links); +} + /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { @@ -516,8 +522,6 @@ int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -int cgroup_task_count(const struct cgroup *cgrp); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2469699408bd..ec7746e5ded1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2391,7 +2391,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * * Return the number of tasks in the cgroup. */ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e97a6e88d036..ae190b0a196a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1c25494f7ae..d9c6ac1532e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4958,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) struct cgroup *cgrp = memcg->css.cgroup; /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ @@ -5140,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) -- cgit v1.2.3 From 889ed9ceaa97bb02bf5d7349e24639f7fc5f4fa0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: remove css_scan_tasks() css_scan_tasks() doesn't have any user left. Remove it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 -- kernel/cgroup.c | 162 ------------------------------------------------- 2 files changed, 168 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 72154fb44fb5..3bd0a7138371 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -813,11 +812,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap); - int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89428b9d9933..05c0c23549f9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2697,168 +2697,6 @@ void css_task_iter_end(struct css_task_iter *it) up_read(&css_set_rwsem); } -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. - */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_rwsem for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. - * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). - */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) -{ - int retval, i; - struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. - */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - /** * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved -- cgit v1.2.3 From 924f0d9a2078f49ff331bb43196ec5afadc16b8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cgroup: drop @skip_css from cgroup_taskset_for_each() If !NULL, @skip_css makes cgroup_taskset_for_each() skip the matching css. The intention of the interface is to make it easy to skip css's (cgroup_subsys_states) which already match the migration target; however, this is entirely unnecessary as migration taskset doesn't include tasks which are already in the target cgroup. Drop @skip_css from cgroup_taskset_for_each(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 8 ++------ kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 4 ++-- kernel/events/core.c | 2 +- kernel/sched/core.c | 4 ++-- net/core/netclassid_cgroup.c | 2 +- net/core/netprio_cgroup.c | 2 +- 8 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1cef07cf9c21..4aefd46d7d95 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3bd0a7138371..581a124c7bc8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -535,15 +535,11 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_css, tset) \ +#define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) \ - if (!(skip_css) || \ - cgroup_taskset_cur_css((tset), \ - (skip_css)->ss->id) != (skip_css)) + (task) = cgroup_taskset_next((tset))) /* * Control Group subsystem type. diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ea26a99076..7201a637c405 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -187,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 65ae0bdf4af8..bf20e4ac2f75 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1398,7 +1398,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1467,7 +1467,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index a3c3ab50271a..6dd714955b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8021,7 +8021,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d4cfc5561830..ba386a06ab11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7600,7 +7600,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7618,7 +7618,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index b865662fba71..22931e1b99b4 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css, void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; - cgroup_taskset_for_each(p, css, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index d7d23e28fafd..f9f3a40d3350 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, css, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v1.2.3 From bc668c7519ff8b4681af80e92f463bec7bf7cf9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: remove cgroup_taskset_cur_css() and cgroup_taskset_size() The two functions don't have any users left. Remove them along with cgroup_taskset->cur_cgrp. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 30 ------------------------------ 2 files changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 581a124c7bc8..ef0b3af0e61c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,9 +528,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id); -int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 704c590a81d7..a9d9bbb12310 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1647,7 +1647,6 @@ struct cgroup_taskset { struct flex_array *tc_array; int tc_array_len; int idx; - struct cgroup *cur_cgrp; }; /** @@ -1662,7 +1661,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) tset->idx = 0; return cgroup_taskset_next(tset); } else { - tset->cur_cgrp = tset->single.cgrp; return tset->single.task; } } @@ -1683,38 +1681,10 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) return NULL; tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; return tc->task; } EXPORT_SYMBOL_GPL(cgroup_taskset_next); -/** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - /** * cgroup_task_migrate - move a task from one cgroup to another. * @old_cgrp; the cgroup @tsk is being migrated from -- cgit v1.2.3 From cc045e3952175e84c38dad22dea14465b9fc8fb5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Feb 2014 16:56:04 +0800 Subject: cgroup: deal with dummp_top in cgroup_name() and cgroup_path() My kernel fails to boot, because blkcg calls cgroup_path() while cgroupfs is not mounted. Fix both cgroup_name() and cgroup_path(). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ef0b3af0e61c..8c283a910b91 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -487,13 +487,21 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq); static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { - return kernfs_name(cgrp->kn, buf, buflen); + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + return kernfs_name(cgrp->kn, buf, buflen); + else + return strlcpy(buf, "/", buflen); } static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { - return kernfs_path(cgrp->kn, buf, buflen); + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + return kernfs_path(cgrp->kn, buf, buflen); + strlcpy(buf, "/", buflen); + return (buflen <= 2) ? NULL : buf; } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) -- cgit v1.2.3 From c75611282cf1bf717c1866e7a7eb4d0743815187 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: add css_set->mg_tasks Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple issues. * Flex array has size limit. On 64bit, struct task_and_cgroup is 24bytes making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is imposible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, we're going to use task->cg_list during migration too. Instead of building a separate array, target tasks will be linked into a dedicated migration list_head on the owning css_set. Tasks on the migration list are treated the same as tasks on the usual tasks list; however, being on a separate list allows cgroup migration code path to keep track of the target tasks by simply keeping the list of css_sets with tasks being migrated, making unpredictable dynamic allocation unnecessary. In prepartion of such migration path update, this patch introduces css_set->mg_tasks list and updates css_set task iterations so that they walk both css_set->tasks and ->mg_tasks. Note that ->mg_tasks isn't used yet. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++++++-- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8c283a910b91..528e2aed36c3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -324,10 +324,14 @@ struct css_set { struct hlist_node hlist; /* - * List running through all tasks using this cgroup - * group. Protected by css_set_lock + * Lists running through all tasks using this cgroup group. + * mg_tasks lists tasks which belong to this cset but are in the + * process of being migrated out or in. Protected by + * css_set_rwsem, but, during migration, once tasks are moved to + * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; + struct list_head mg_tasks; /* * List of cgrp_cset_links pointing at cgroups referenced from this diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8ab800c7bac0..b80c611ff836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, atomic_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -2590,9 +2591,14 @@ static void css_advance_task_iter(struct css_task_iter *it) } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; - } while (list_empty(&cset->tasks)); + } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + it->cset_link = l; - it->task = cset->tasks.next; + + if (!list_empty(&cset->tasks)) + it->task = cset->tasks.next; + else + it->task = cset->mg_tasks.next; } /** @@ -2636,24 +2642,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; - struct cgrp_cset_link *link; + struct cgrp_cset_link *link = list_entry(it->cset_link, + struct cgrp_cset_link, cset_link); /* If the iterator cg is NULL, we have no tasks */ if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ l = l->next; - link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); - if (l == &link->cset->tasks) { - /* - * We reached the end of this task list - move on to the - * next cgrp_cset_link. - */ + + if (l == &link->cset->tasks) + l = link->cset->mg_tasks.next; + + if (l == &link->cset->mg_tasks) css_advance_task_iter(it); - } else { + else it->task = l; - } + return res; } @@ -4502,16 +4513,23 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); } + continue; + overflow: + seq_puts(seq, " ...\n"); } up_read(&css_set_rwsem); return 0; -- cgit v1.2.3 From b3dc094e93905ae9c1bc0815402ad8e5b203d068 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: use css_set->mg_tasks to track target tasks during migration Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple issues. * Flex array has size limit. On 64bit, struct task_and_cgroup is 24bytes making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is imposible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, this patch updates the migration path to use task->cg_list to track target tasks. The previous patch already added css_set->mg_tasks and updated iterations in non-migration paths to include them during task migration. This patch updates migration path to actually make use of it. Instead of putting onto a flex_array, each target task is moved from its css_set->tasks list to css_set->mg_tasks and the migration path keeps trace of all the source css_sets and the associated cgroups. Once all source css_sets are determined, the destination css_set for each is determined, linked to the matching source css_set and put on a separate list. To iterate the target tasks, migration path just needs to iterat through either the source or target css_sets, depending on whether migration has been committed or not, and the tasks on their ->mg_tasks lists. cgroup_taskset is updated to contain the list_heads for source and target css_sets and the iteration cursor. cgroup_taskset_*() are accordingly updated to walk through css_sets and their ->mg_tasks. This resolves the above listed issues with moderate additional complexity. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++ kernel/cgroup.c | 223 +++++++++++++++++++++++++------------------------ 2 files changed, 131 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528e2aed36c3..3a1cb265afd6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -346,6 +346,22 @@ struct css_set { */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + /* + * List of csets participating in the on-going migration either as + * source or destination. Protected by cgroup_mutex. + */ + struct list_head mg_node; + + /* + * If this cset is acting as the source of migration the following + * two fields are set. mg_src_cgrp is the source cgroup of the + * on-going migration and mg_dst_cset is the destination cset the + * target tasks on this cset should be migrated to. Protected by + * cgroup_mutex. + */ + struct cgroup *mg_src_cgrp; + struct css_set *mg_dst_cset; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b80c611ff836..5def4a800425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include /* TODO: replace with more sophisticated array */ -#include /* used in cgroup_attach_task */ #include #include @@ -645,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -1639,20 +1639,26 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; - struct css_set *cset; -}; - +/* used to track tasks and other necessary states during migration */ struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; }; /** @@ -1663,12 +1669,10 @@ struct cgroup_taskset { */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) { - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - return tset->single.task; - } + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset); } /** @@ -1680,13 +1684,27 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) { - struct task_and_cgroup *tc; + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); - tc = flex_array_get(tset->tc_array, tset->idx++); - return tc->task; + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + return task; + } + + cset = list_next_entry(cset, mg_node); + task = NULL; + } + + return NULL; } /** @@ -1714,11 +1732,13 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); + get_css_set(new_cset); + task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->tasks); + list_move(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1741,80 +1761,58 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int ret, i, group_size; - struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; struct cgroup_subsys_state *css, *failed_css = NULL; - /* threadgroup list cursor and array */ - struct task_struct *task; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; - - /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. - */ - if (threadgroup) - group_size = get_nr_threads(leader); - else - group_size = 1; - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (ret) - goto out_free_group_list; + struct css_set *cset, *tmp_cset; + struct task_struct *task, *tmp_task; + int i, ret; - i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - down_read(&css_set_rwsem); + down_write(&css_set_rwsem); rcu_read_lock(); task = leader; do { - struct task_and_cgroup ent; + struct cgroup *src_cgrp; /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; - /* as per above, nr_threads may decrease, but not increase. */ - BUG_ON(i >= group_size); - ent.task = task; - ent.cgrp = task_cgroup_from_root(task, root); + cset = task_css_set(task); + src_cgrp = task_cgroup_from_root(task, cgrp->root); + /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) + if (src_cgrp == cgrp) goto next; - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ - ret = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(ret != 0); - i++; + + if (!cset->mg_src_cgrp) { + WARN_ON(!list_empty(&cset->mg_tasks)); + WARN_ON(!list_empty(&cset->mg_node)); + + cset->mg_src_cgrp = src_cgrp; + list_add(&cset->mg_node, &tset.src_csets); + get_css_set(cset); + } + + list_move(&task->cg_list, &cset->mg_tasks); next: if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); - /* remember the number of threads in the array for later. */ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; + up_write(&css_set_rwsem); /* methods shouldn't be called if no task is actually migrating */ - ret = 0; - if (!group_size) - goto out_free_group_list; + if (list_empty(&tset.src_csets)) + return 0; /* * step 1: check that we can legitimately attach to the cgroup. @@ -1833,16 +1831,21 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - for (i = 0; i < group_size; i++) { - struct css_set *old_cset; + list_for_each_entry(cset, &tset.src_csets, mg_node) { + struct css_set *dst_cset; - tc = flex_array_get(group, i); - old_cset = task_css_set(tc->task); - tc->cset = find_css_set(old_cset, cgrp); - if (!tc->cset) { + dst_cset = find_css_set(cset, cgrp); + if (!dst_cset) { ret = -ENOMEM; - goto out_put_css_set_refs; + goto out_release_tset; } + + if (list_empty(&dst_cset->mg_node)) + list_add(&dst_cset->mg_node, &tset.dst_csets); + else + put_css_set(dst_cset, false); + + cset->mg_dst_cset = dst_cset; } /* @@ -1851,12 +1854,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * failure cases after here, so this is the commit point. */ down_write(&css_set_rwsem); - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); + list_for_each_entry(cset, &tset.src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); } up_write(&css_set_rwsem); - /* nothing is sensitive to fork() after this point. */ + + /* migration is committed, all target tasks are now on dst_csets */ + tset.csets = &tset.dst_csets; + + /* nothing is sensitive to fork() after this point */ /* * step 4: do subsystem attach callbacks. @@ -1865,30 +1873,27 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (css->ss->attach) css->ss->attach(css, &tset); - /* - * step 5: success! and cleanup - */ ret = 0; -out_put_css_set_refs: - if (ret) { - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (!tc->cset) - break; - put_css_set(tc->cset, false); - } - } + goto out_release_tset; + out_cancel_attach: - if (ret) { - for_each_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } + for_each_css(css, i, cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } -out_free_group_list: - flex_array_free(group); +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset.dst_csets, &tset.src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { + list_splice_init(&cset->mg_tasks, &cset->tasks); + cset->mg_dst_cset = NULL; + cset->mg_src_cgrp = NULL; + list_del_init(&cset->mg_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); return ret; } @@ -3895,6 +3900,8 @@ int __init cgroup_init_early(void) atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); + INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&cgroup_dummy_root); -- cgit v1.2.3 From 1958d2d53dadbb1c9aaf0b37741f13a60098b243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: split process / task migration into four steps Currently, process / task migration is a single operation which may fail depending on memory pressure or the involved controllers' ->can_attach() callbacks. One problem with this approach is migration of multiple targets. It's impossible to tell whether a given target will be successfully migrated beforehand and cgroup core can't keep track of enough states to roll back after intermediate failure. This is already an issue with cgroup_transfer_tasks(). Also, we're gonna need multiple target migration for unified hierarchy. This patch splits migration into four stages - cgroup_migrate_add_src(), cgroup_migrate_prepare_dst(), cgroup_migrate() and cgroup_migrate_finish(), where cgroup_migrate_prepare_dst() performs all the operations which may fail due to allocation failure without actually migrating the target. The four separate stages mean that, disregarding ->can_attach() failures, the success or failure of multi target migration can be determined before performing any actual migration. If preparations of all targets succeed, the whole thing will succeed. If not, the whole operation can fail without any side-effect. Since the previous patch to use css_set->mg_tasks to keep track of migration targets, the only thing which may need memory allocation during migration is the target css_sets. cgroup_migrate_prepare() pins all source and target css_sets and link them up. Note that this can be performed without holding threadgroup_lock even if the target is a process. As long as cgroup_mutex is held, no new css_set can be put into play. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 240 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 182 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3a1cb265afd6..4829a577c1b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -350,6 +350,7 @@ struct css_set { * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ + struct list_head mg_preload_node; struct list_head mg_node; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 23e3a8c74bd4..a93f6f1ebc69 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); @@ -1755,16 +1756,137 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, } /** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @leader: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) +{ + struct css_set *cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + down_write(&css_set_rwsem); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. + */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + /* nothing to do if this cset already belongs to the cgroup */ + if (src_cgrp == dst_cgrp) + return; + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved to @dst_cgrp and all the source css_sets + * have been preloaded to @preloaded_csets. This function looks up and + * pins all destination css_sets, links each to its source, and put them on + * @preloaded_csets. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. + */ +static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset; + + lockdep_assert_held(&cgroup_mutex); + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, dst_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset, false); + } + + list_splice(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @cgrp: the destination cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * + * Migrate a process or task denoted by @leader to @cgrp. If migrating a + * process, the caller must be holding threadgroup_lock of @leader. The + * caller is also responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking group_migrate_prepare_dst() before + * actually starting migrating. + */ +static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, + bool threadgroup) { struct cgroup_taskset tset = { .src_csets = LIST_HEAD_INIT(tset.src_csets), @@ -1785,29 +1907,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - struct cgroup *src_cgrp; - /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; cset = task_css_set(task); - src_cgrp = task_cgroup_from_root(task, cgrp->root); - - /* nothing to do if this task is already in the cgroup */ - if (src_cgrp == cgrp) + if (!cset->mg_src_cgrp) goto next; - if (!cset->mg_src_cgrp) { - WARN_ON(!list_empty(&cset->mg_tasks)); - WARN_ON(!list_empty(&cset->mg_node)); - - cset->mg_src_cgrp = src_cgrp; - list_add(&cset->mg_node, &tset.src_csets); - get_css_set(cset); - } - list_move(&task->cg_list, &cset->mg_tasks); + list_move(&cset->mg_node, &tset.src_csets); + list_move(&cset->mg_dst_cset->mg_node, &tset.dst_csets); next: if (!threadgroup) break; @@ -1819,9 +1929,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (list_empty(&tset.src_csets)) return 0; - /* - * step 1: check that we can legitimately attach to the cgroup. - */ + /* check that we can legitimately attach to the cgroup */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); @@ -1833,30 +1941,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. - */ - list_for_each_entry(cset, &tset.src_csets, mg_node) { - struct css_set *dst_cset; - - dst_cset = find_css_set(cset, cgrp); - if (!dst_cset) { - ret = -ENOMEM; - goto out_release_tset; - } - - if (list_empty(&dst_cset->mg_node)) - list_add(&dst_cset->mg_node, &tset.dst_csets); - else - put_css_set(dst_cset, false); - - cset->mg_dst_cset = dst_cset; - } - - /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ down_write(&css_set_rwsem); list_for_each_entry(cset, &tset.src_csets, mg_node) { @@ -1866,14 +1953,13 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } up_write(&css_set_rwsem); - /* migration is committed, all target tasks are now on dst_csets */ - tset.csets = &tset.dst_csets; - - /* nothing is sensitive to fork() after this point */ - /* - * step 4: do subsystem attach callbacks. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. */ + tset.csets = &tset.dst_csets; + for_each_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -1893,15 +1979,50 @@ out_release_tset: list_splice_init(&tset.dst_csets, &tset.src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { list_splice_init(&cset->mg_tasks, &cset->tasks); - cset->mg_dst_cset = NULL; - cset->mg_src_cgrp = NULL; list_del_init(&cset->mg_node); - put_css_set_locked(cset, false); } up_write(&css_set_rwsem); return ret; } +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of @tsk or each thread in the threadgroup individually in turn. + */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + /* look up all src csets */ + down_read(&css_set_rwsem); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + up_read(&css_set_rwsem); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + if (!ret) + ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock @@ -3906,6 +4027,7 @@ int __init cgroup_init_early(void) INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_preload_node); INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; -- cgit v1.2.3 From 0e1d768f1b1873272ec4e8dc1482bb5281855017 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: drop task_lock() protection around task->cgroups For optimization, task_lock() is additionally used to protect task->cgroups. The optimization is pretty dubious as either css_set_rwsem is grabbed anyway or PF_EXITING already protects task->cgroups. It adds only overhead and confusion at this point. Let's drop task_[un]lock() and update comments accordingly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ++-- kernel/cgroup.c | 97 +++++++++++++------------------------------------- 2 files changed, 28 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4829a577c1b9..acbb9a4cb6e9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -658,10 +658,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; +extern struct rw_semaphore css_set_rwsem; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ - lockdep_is_held(&(task)->alloc_lock) || \ - lockdep_is_held(&cgroup_mutex) || (__c)) + lockdep_is_held(&cgroup_mutex) || \ + lockdep_is_held(&css_set_rwsem) || \ + ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fa0567f4eedd..f783af900208 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -80,12 +80,21 @@ static DEFINE_MUTEX(cgroup_tree_mutex); /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. + * + * css_set_rwsem protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. + * + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ +DECLARE_RWSEM(css_set_rwsem); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_rwsem); #else static DEFINE_MUTEX(cgroup_mutex); +static DECLARE_RWSEM(css_set_rwsem); #endif /* @@ -338,12 +347,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; - -/* - * css_set_rwsem protects the list of css_set objects, and the chain of - * tasks off each css_set. - */ -static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -803,10 +806,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } /* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. @@ -836,18 +835,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that top_cgroup cannot be deleted. * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one task's cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task's cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. - * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ @@ -1329,8 +1316,6 @@ static void cgroup_enable_task_cg_lists(void) */ read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); - WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1349,8 +1334,6 @@ static void cgroup_enable_task_cg_lists(void) get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); - - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: @@ -1743,11 +1726,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, old_cset = task_css_set(tsk); get_css_set(new_cset); - - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); - task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -1999,8 +1978,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2034,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. + * cgroup_mutex and threadgroup. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -4155,12 +4133,6 @@ core_initcall(cgroup_wq_init); * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc//cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ @@ -4310,15 +4282,12 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; down_write(&css_set_rwsem); - cset = task_css_set_check(current, - lockdep_is_held(&css_set_rwsem)); - task_lock(child); + cset = task_css_set(current); if (list_empty(&child->cg_list)) { rcu_assign_pointer(child->cgroups, cset); list_add(&child->cg_list, &cset->tasks); get_css_set(cset); } - task_unlock(child); up_write(&css_set_rwsem); } @@ -4347,27 +4316,13 @@ void cgroup_post_fork(struct task_struct *child) * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - either PF_EXITING is visible to migration path or + * @tsk never got on the tasklist. */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -4377,20 +4332,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. Optimistically - * check cg_list before taking css_set_rwsem. + * Unlink from @tsk from its css_set. As migration path can't race + * with us, we can check cg_list without grabbing css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - if (!list_empty(&tsk->cg_list)) { - list_del_init(&tsk->cg_list); - put_cset = true; - } + list_del_init(&tsk->cg_list); up_write(&css_set_rwsem); + put_cset = true; } /* Reassign the task to the init_css_set. */ - task_lock(tsk); cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); @@ -4405,7 +4357,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } } } - task_unlock(tsk); if (put_cset) put_css_set(cset, true); -- cgit v1.2.3 From fdce6bf8c5b6968eb9b96ecc5fe400514a604902 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: remove NULL checks from [pr_cont_]cgroup_{name|path}() The dummy hierarchy is now a fully functional one and dummy_top has a kernfs_node associated with it. Drop the NULL checks in [pr_cont_]cont_{name|path}() which are no longer necessary. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index acbb9a4cb6e9..9f4f253f0e47 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -508,39 +508,23 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq); static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { - /* dummy_top doesn't have a kn associated */ - if (cgrp->kn) - return kernfs_name(cgrp->kn, buf, buflen); - else - return strlcpy(buf, "/", buflen); + return kernfs_name(cgrp->kn, buf, buflen); } static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { - /* dummy_top doesn't have a kn associated */ - if (cgrp->kn) - return kernfs_path(cgrp->kn, buf, buflen); - strlcpy(buf, "/", buflen); - return (buflen <= 2) ? NULL : buf; + return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { - /* dummy_top doesn't have a kn associated */ - if (cgrp->kn) - pr_cont_kernfs_name(cgrp->kn); - else - pr_cont("/"); + pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { - /* dummy_top doesn't have a kn associated */ - if (cgrp->kn) - pr_cont_kernfs_path(cgrp->kn); - else - pr_cont("/"); + pr_cont_kernfs_path(cgrp->kn); } char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); -- cgit v1.2.3 From 944196278d3dea0cece1636de417b56897d9a108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: move ->subsys_mask from cgroupfs_root to cgroup cgroupfs_root->subsys_mask represents the controllers attached to the hierarchy. This patch moves the field to cgroup. Subsystem initialization and rebinding updates the top cgroup's subsys_mask. For !root cgroups, the subsys_mask bits are set from create_css() and cleared from kill_css(), which effectively means that all cgroups will have the same subsys_mask as the top cgroup. While this doesn't make any difference now, this will help implementation of the default unified hierarchy where !root cgroups may have subsets of the top_cgroup's subsys_mask. While at it, __kill_css() is split out of kill_css(). The former doesn't care about the subsys_mask while the latter becomes noop if the controller is already killed and clears the matching bit if not before proceeding to killing the css. This will be used later by the default unified hierarchy implementation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ++--- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9f4f253f0e47..3752a0182c94 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -173,6 +173,9 @@ struct cgroup { */ u64 serial_nr; + /* The bitmask of subsystems attached to this cgroup */ + unsigned long subsys_mask; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -276,9 +279,6 @@ enum { struct cgroupfs_root { struct kernfs_root *kf_root; - /* The bitmask of subsystems attached to this hierarchy */ - unsigned long subsys_mask; - /* Unique id for this hierarchy. */ int hierarchy_id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b632981bd9c7..807f88dbda51 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -526,7 +526,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { + if (root->top_cgroup.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -739,7 +739,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, root->subsys_mask); + rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1038,8 +1038,8 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, ss->root = dst_root; css->cgroup = dst_top; - src_root->subsys_mask &= ~(1 << ssid); - dst_root->subsys_mask |= 1 << ssid; + src_top->subsys_mask &= ~(1 << ssid); + dst_top->subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1058,7 +1058,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1262,12 +1262,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->top_cgroup.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; + removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1515,7 +1515,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + (opts.subsys_mask != root->top_cgroup.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3594,6 +3594,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); + cgrp->subsys_mask |= 1 << ss->id; + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3700,7 +3702,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->subsys_mask & (1 << ssid)) { + if (root->top_cgroup.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -3788,17 +3790,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) +static void __kill_css(struct cgroup_subsys_state *css) { + lockdep_assert_held(&cgroup_tree_mutex); + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. @@ -3824,6 +3819,28 @@ static void kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + + lockdep_assert_held(&cgroup_tree_mutex); + + /* if already killed, noop */ + if (cgrp->subsys_mask & (1 << css->ss->id)) { + cgrp->subsys_mask &= ~(1 << css->ss->id); + __kill_css(css); + } +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4036,7 +4053,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.subsys_mask |= 1 << ss->id; + cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4192,7 +4209,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3 From 3dd06ffa9df99aa88f4e01eaa0c9d3cb362430b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: rename cgroup_dummy_root and related names The dummy root will be repurposed to serve as the default unified hierarchy. Let's rename things in preparation. * s/cgroup_dummy_root/cgrp_dfl_root/ * s/cgroupfs_root/cgroup_root/ as we don't do fs part directly anymore * s/cgroup_root->top_cgroup/cgroup_root->cgrp/ for brevity This is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 14 ++--- kernel/cgroup.c | 168 ++++++++++++++++++++++++------------------------- 2 files changed, 88 insertions(+), 94 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3752a0182c94..77294fc66603 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -24,7 +24,7 @@ #ifdef CONFIG_CGROUPS -struct cgroupfs_root; +struct cgroup_root; struct cgroup_subsys; struct inode; struct cgroup; @@ -179,7 +179,7 @@ struct cgroup { /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; - struct cgroupfs_root *root; + struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this @@ -211,7 +211,7 @@ struct cgroup { #define MAX_CGROUP_ROOT_NAMELEN 64 -/* cgroupfs_root->flags */ +/* cgroup_root->flags */ enum { /* * Unfortunately, cgroup core and various controllers are riddled @@ -272,18 +272,18 @@ enum { }; /* - * A cgroupfs_root represents the root of a cgroup hierarchy, and may be + * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ -struct cgroupfs_root { +struct cgroup_root { struct kernfs_root *kf_root; /* Unique id for this hierarchy. */ int hierarchy_id; /* The root cgroup. Root is destroyed on its release. */ - struct cgroup top_cgroup; + struct cgroup cgrp; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -598,7 +598,7 @@ struct cgroup_subsys { const char *name; /* link to parent, protected by cgroup_lock() */ - struct cgroupfs_root *root; + struct cgroup_root *root; /* * List of cftypes. Each entry is the first entry of an array diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 807f88dbda51..60ea16058c42 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,14 +138,11 @@ static const char *cgroup_subsys_name[] = { #undef SUBSYS /* - * The dummy hierarchy, reserved for the subsystems that are otherwise + * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroupfs_root cgroup_dummy_root; - -/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ -static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; +static struct cgroup_root cgrp_dfl_root; /* The list of hierarchy roots */ @@ -175,7 +172,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -514,7 +511,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - struct cgroupfs_root *root = cgrp->root; + struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; @@ -526,7 +523,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->top_cgroup.subsys_mask & (1UL << i)) { + if (root->cgrp.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -685,14 +682,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *top_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kf_root->kn->priv; - return top_cgrp->root; + return root_cgrp->root; } -static int cgroup_init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -706,7 +703,7 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) return 0; } -static void cgroup_exit_root_id(struct cgroupfs_root *root) +static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); @@ -716,7 +713,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) } } -static void cgroup_free_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroup_root *root) { if (root) { /* hierarhcy ID shoulid already have been released */ @@ -727,9 +724,9 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_destroy_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroup_root *root) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; mutex_lock(&cgroup_tree_mutex); @@ -739,7 +736,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -770,7 +767,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroupfs_root *root) + struct cgroup_root *root) { struct cgroup *res = NULL; @@ -778,7 +775,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&css_set_rwsem); if (cset == &init_css_set) { - res = &root->top_cgroup; + res = &root->cgrp; } else { struct cgrp_cset_link *link; @@ -801,7 +798,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * called with cgroup_mutex and css_set_rwsem held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -837,9 +834,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup + * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. + * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() @@ -905,7 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) kfree(cgrp); } else { /* - * This is top cgroup's refcnt reaching zero, which + * This is root cgroup's refcnt reaching zero, which * indicates that the root should be released. */ cgroup_destroy_root(cgrp->root); @@ -976,10 +973,9 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { - struct cgroup *dst_top = &dst_root->top_cgroup; struct cgroup_subsys *ss; int ssid, ret; @@ -991,20 +987,20 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, continue; /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) continue; /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->top_cgroup.children)) + if (!list_empty(&ss->root->cgrp.children)) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgroup_dummy_root) + if (dst_root != &cgrp_dfl_root) return -EBUSY; } - if (dst_root != &cgroup_dummy_root) { - ret = cgroup_populate_dir(dst_top, ss_mask); + if (dst_root != &cgrp_dfl_root) { + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); if (ret) return ret; } @@ -1015,50 +1011,48 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgroup_dummy_root) - cgroup_clear_dir(&ss->root->top_cgroup, 1 << ssid); + if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { - struct cgroupfs_root *src_root; - struct cgroup *src_top; + struct cgroup_root *src_root; struct cgroup_subsys_state *css; if (!(ss_mask & (1 << ssid))) continue; src_root = ss->root; - src_top = &src_root->top_cgroup; - css = cgroup_css(src_top, ss); + css = cgroup_css(&src_root->cgrp, ss); - WARN_ON(!css || cgroup_css(dst_top, ss)); + WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); - RCU_INIT_POINTER(src_top->subsys[ssid], NULL); - rcu_assign_pointer(dst_top->subsys[ssid], css); + RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); + rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); ss->root = dst_root; - css->cgroup = dst_top; + css->cgroup = &dst_root->cgrp; - src_top->subsys_mask &= ~(1 << ssid); - dst_top->subsys_mask |= 1 << ssid; + src_root->cgrp.subsys_mask &= ~(1 << ssid); + dst_root->cgrp.subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); } - if (dst_root != &cgroup_dummy_root) - kernfs_activate(dst_top->kn); + if (dst_root != &cgrp_dfl_root) + kernfs_activate(dst_root->cgrp.kn); return 0; } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1072,7 +1066,7 @@ static int cgroup_show_options(struct seq_file *seq, seq_printf(seq, ",release_agent=%s", root->release_agent_path); spin_unlock(&release_agent_path_lock); - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1245,7 +1239,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1262,12 +1256,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->top_cgroup.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; - removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; + removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1280,7 +1274,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->top_cgroup.children)) { + if (!list_empty(&root->cgrp.children)) { ret = -EBUSY; goto out_unlock; } @@ -1289,7 +1283,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgroup_dummy_root, removed_mask); + rebind_subsystems(&cgrp_dfl_root, removed_mask); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1368,10 +1362,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dummy_css.cgroup = cgrp; } -static void init_cgroup_root(struct cgroupfs_root *root, +static void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); @@ -1385,13 +1379,13 @@ static void init_cgroup_root(struct cgroupfs_root *root, if (opts->name) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); - struct cgroup *root_cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->cgrp; struct css_set *cset; int i, ret; @@ -1443,7 +1437,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) cgroup_root_count++; /* - * Link the top cgroup in this hierarchy into all the css_set + * Link the root cgroup in this hierarchy into all the css_set * objects. */ down_write(&css_set_rwsem); @@ -1472,7 +1466,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; @@ -1496,7 +1490,7 @@ retry: for_each_root(root) { bool name_match = false; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; /* @@ -1515,7 +1509,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->top_cgroup.subsys_mask)) { + (opts.subsys_mask != root->cgrp.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -1533,13 +1527,13 @@ retry: } /* - * A root's lifetime is governed by its top cgroup. Zero + * A root's lifetime is governed by its root cgroup. Zero * ref indicate that the root is being destroyed. Wait for * destruction to complete so that the subsystems are free. * We can use wait_queue for the wait but this path is * super cold. Let's just sleep for a bit and retry. */ - if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); @@ -1586,16 +1580,16 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); return dentry; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -1622,7 +1616,7 @@ static struct kobject *cgroup_kobj; */ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; char *path = NULL; @@ -2112,14 +2106,14 @@ out_unlock_cgroup: */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroupfs_root *root; + struct cgroup_root *root; int retval = 0; mutex_lock(&cgroup_mutex); for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; down_read(&css_set_rwsem); @@ -2151,7 +2145,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroupfs_root *root = css->cgroup->root; + struct cgroup_root *root = css->cgroup->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) @@ -2362,14 +2356,14 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); /* don't bother if @ss isn't attached */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) return 0; /* add/rm files for all cgroups created before */ @@ -3623,7 +3617,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroupfs_root *root = parent->root; + struct cgroup_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; struct kernfs_node *kn; @@ -3702,7 +3696,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->top_cgroup.subsys_mask & (1 << ssid)) { + if (root->cgrp.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -4031,17 +4025,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) INIT_LIST_HEAD(&ss->cfts); - /* Create the top cgroup state for this subsystem */ - ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, cgroup_dummy_top); + init_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ + * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4053,7 +4047,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4071,7 +4065,7 @@ int __init cgroup_init_early(void) struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgroup_dummy_root, &opts); + init_cgroup_root(&cgrp_dfl_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4112,7 +4106,7 @@ int __init cgroup_init(void) key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_setup_root(&cgroup_dummy_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4181,7 +4175,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct task_struct *tsk; char *buf, *path; int retval; - struct cgroupfs_root *root; + struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -4204,12 +4198,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", @@ -4639,7 +4633,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) { + if (c != &cgrp_dfl_root.cgrp) { cgroup_name(c, name_buf, NAME_MAX + 1); name = name_buf; } -- cgit v1.2.3 From 4d3bb511b5f9980fc3e9ae5939ebc475b231d3fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: drop const from @buffer of cftype->write_string() cftype->write_string() just passes on the writeable buffer from kernfs and there's no reason to add const restriction on the buffer. The only thing const achieves is unnecessarily complicating parsing of the buffer. Drop const from @buffer. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 4 ++-- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 4 ++-- net/core/netprio_cgroup.c | 2 +- net/ipv4/tcp_memcontrol.c | 2 +- security/device_cgroup.c | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 861c363e4129..033745cd7fba 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1408,13 +1408,13 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, } static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf) + char *buf) { return tg_set_conf(css, cft, buf, true); } static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf) + char *buf) { return tg_set_conf(css, cft, buf, false); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 461187943392..f5de45b6af36 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1701,13 +1701,13 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, } static int cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { return __cfqg_set_weight_device(css, cft, buf, false); } static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { return __cfqg_set_weight_device(css, cft, buf, true); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 77294fc66603..79993ac066c5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -454,7 +454,7 @@ struct cftype { * Returns 0 or -ve error code. */ int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer); + char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60ea16058c42..f5754910e80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2143,7 +2143,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, } static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct cgroup_root *root = css->cgroup->root; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2ea98b216bff..2bc4a2256444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -442,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { bool freeze; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8d5324583aa4..efbf9baf77ec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,7 +1610,7 @@ out_unlock: * Common handling for a write to a "cpus" or "mems" file. */ static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { struct cpuset *cs = css_cs(css); struct cpuset *trialcs; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b135853e68f3..595d7fd795e1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -254,7 +254,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { int idx, name, ret; unsigned long long val; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9c6ac1532e6..96f94a9f2faf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5242,7 +5242,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * RES_LIMIT. */ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; @@ -6063,7 +6063,7 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Interpretation of args is defined by control file implementation. */ static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f9f3a40d3350..3825f669147b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -186,7 +186,7 @@ static int read_priomap(struct seq_file *sf, void *v) } static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { char devname[IFNAMSIZ + 1]; struct net_device *dev; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 20a0aca9131e..d4f015ad6c84 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) } static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7f88bcde7c61..8365909f5f8c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -496,7 +496,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup) * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, - int filetype, const char *buffer) + int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ @@ -652,7 +652,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, } static int devcgroup_access_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { int retval; -- cgit v1.2.3 From a2dd424750807f83632a2a70293961086fd8f870 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: make cgrp_dfl_root mountable cgrp_dfl_root will be used as the default unified hierarchy. This patch makes cgrp_dfl_root mountable by making the following changes. * cgroup_init_early() now initializes cgrp_dfl_root w/ CGRP_ROOT_SANE_BEHAVIOR. The default hierarchy is always sane. * parse_cgroupfs_options() and cgroup_mount() are updated such that cgrp_dfl_root is mounted if sane_behavior is specified w/o any subsystems. * rebind_subsystems() now populates the root directory of cgrp_dfl_root. Note that the function still guarantees success of rebinding subsystems to cgrp_dfl_root. If populating fails while rebinding to cgrp_dfl_root, it whines but ignores the error. * For backward compatibility, the default hierarchy shows up in /proc/$PID/cgroup only after it's explicitly mounted so that userland which doesn't make use of it doesn't see any change. * "current_css_set_cg_links" file of debug cgroup now treats the default hierarchy the same as other hierarchies. This is visible to userland. Given that it's for debug controller, this should be fine. * While at it, implement cgroup_on_dfl() which tests whether a give cgroup is on the default hierarchy or not. The above changes make cgrp_dfl_root mostly equivalent to other controllers but the actual unified hierarchy behaviors are not implemented yet. Let's plug child cgroup creation in cgrp_dfl_root from create_cgroup() for now. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 10 ++++++ kernel/cgroup.c | 94 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 71 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 79993ac066c5..7e9fa505b7bb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -250,6 +250,9 @@ enum { * * - "cgroup.clone_children" is removed. * + * - If mount is requested with sane_behavior but without any + * subsystem, the default unified hierarchy is mounted. + * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of * being moved to an ancestor. @@ -468,6 +471,13 @@ struct cftype { #endif }; +extern struct cgroup_root cgrp_dfl_root; + +static inline bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f5754910e80b..69b4939e9f6d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -142,7 +142,13 @@ static const char *cgroup_subsys_name[] = { * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root; + +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. + */ +static bool cgrp_dfl_root_visible; /* The list of hierarchy roots */ @@ -999,10 +1005,22 @@ static int rebind_subsystems(struct cgroup_root *dst_root, return -EBUSY; } - if (dst_root != &cgrp_dfl_root) { - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); - if (ret) + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + if (ret) { + if (dst_root != &cgrp_dfl_root) return ret; + + /* + * Rebinding back to the default root is not allowed to + * fail. Using both default and non-default roots should + * be rare. Moving subsystems back and forth even more so. + * Just warn about it and continue. + */ + if (cgrp_dfl_root_visible) { + pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + } } /* @@ -1011,7 +1029,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); @@ -1039,8 +1057,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->bind(css); } - if (dst_root != &cgrp_dfl_root) - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dst_root->cgrp.kn); return 0; } @@ -1190,16 +1207,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - set_bit(i, &opts->subsys_mask); - /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1211,6 +1218,23 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } + } else { + /* + * If the 'all' option was specified select all the + * subsystems, otherwise if 'none', 'name=' and a subsystem + * name options were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); + + /* + * We either have to specify by name or by subsystems. (So + * all empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; } /* @@ -1226,13 +1250,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->subsys_mask && opts->none) return -EINVAL; - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - return 0; } @@ -1487,6 +1504,14 @@ retry: goto out_unlock; /* look for a matching existing root */ + if (!opts.subsys_mask && !opts.none && !opts.name) { + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + ret = 0; + goto out_unlock; + } + for_each_root(root) { bool name_match = false; @@ -3622,6 +3647,13 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; + /* + * XXX: The default hierarchy isn't fully implemented yet. Block + * !root cgroup creation on it for now. + */ + if (root == &cgrp_dfl_root) + return -EINVAL; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -4061,7 +4093,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = { }; + static struct cgroup_sb_opts __initdata opts = + { .flags = CGRP_ROOT_SANE_BEHAVIOR }; struct cgroup_subsys *ss; int i; @@ -4198,7 +4231,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root) + if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -4631,15 +4664,10 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name = "?"; - - if (c != &cgrp_dfl_root.cgrp) { - cgroup_name(c, name_buf, NAME_MAX + 1); - name = name_buf; - } + cgroup_name(c, name_buf, NAME_MAX + 1); seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); + c->root->hierarchy_id, name_buf); } rcu_read_unlock(); up_read(&css_set_rwsem); -- cgit v1.2.3 From 8cbbf2c972c4444cad36f61cd571714c39b8cf04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: implement CFTYPE_ONLY_ON_DFL This cftype flag makes the file only appear on the default hierarchy. This will later be used for cgroup.controllers file. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7e9fa505b7bb..43d1ed30bae3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -384,6 +384,7 @@ enum { CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ }; #define MAX_CFTYPE_NAME 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 69b4939e9f6d..37b6d534b0ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2356,6 +2356,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) -- cgit v1.2.3 From 1ec41830e087cda1f62dda4182c2b62811eb0ffc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 28 Mar 2014 15:22:19 +0800 Subject: cgroup: remove useless argument from cgroup_exit() Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 5 ++--- kernel/exit.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43d1ed30bae3..c2515851c1aa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -33,7 +33,7 @@ extern int cgroup_init_early(void); extern int cgroup_init(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); -extern void cgroup_exit(struct task_struct *p, int run_callbacks); +extern void cgroup_exit(struct task_struct *p); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -843,7 +843,7 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} -static inline void cgroup_exit(struct task_struct *p, int callbacks) {} +static inline void cgroup_exit(struct task_struct *p) {} static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60fd6f1f6d4e..f7f94322d312 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4416,7 +4416,6 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -4433,7 +4432,7 @@ void cgroup_post_fork(struct task_struct *child) * init_css_set refcnting. init_css_set never goes away and we can't race * with migration path - PF_EXITING is visible to migration path. */ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) +void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -4455,7 +4454,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (run_callbacks && need_forkexit_callback) { + if (need_forkexit_callback) { /* see cgroup_post_fork() for details */ for_each_subsys(ss, i) { if (ss->exit) { diff --git a/kernel/exit.c b/kernel/exit.c index 1e77fc645317..6480d1c85d7a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -797,7 +797,7 @@ void do_exit(long code) */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); + cgroup_exit(tsk); if (group_dead) disassociate_ctty(1); -- cgit v1.2.3