From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Tue, 10 Feb 2015 14:11:41 -0800
Subject: memcg: zap __memcg_{charge,uncharge}_slab

They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's
zap them and call these functions directly.

Signed-off-by: Vladimir Davydov
Cc: Johannes Weiner
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7c95af8d552c..18ccb2988979 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups);
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+		      unsigned long nr_pages);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s);
 
-- cgit v1.2.3


From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Tue, 10 Feb 2015 14:11:47 -0800
Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex

mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup.  Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed.  The
list is protected by memcg_slab_mutex.  The mutex is also used to
protect kmem_cache->memcg_params->memcg_caches arrays and synchronizes
kmem_cache_destroy vs memcg_unregister_all_caches.

However, we can perfectly get on without these two.  To destroy all
caches corresponding to a memory cgroup, we can walk over the global
list of kmem caches, slab_caches, and we can do all the synchronization
stuff using the slab_mutex instead of the memcg_slab_mutex.  This patch
therefore gets rid of the memcg_slab_caches and memcg_slab_mutex.

Apart from this nice cleanup, it also:

 - assures that rcu_barrier() is called once at max when a root cache
   is destroyed or a memory cgroup is freed, no matter how many caches
   have SLAB_DESTROY_BY_RCU flag set;

 - fixes the race between kmem_cache_destroy and kmem_cache_create
   that exists, because memcg_cleanup_cache_params, which is called
   from kmem_cache_destroy after checking that kmem_cache->refcount=0,
   releases the slab_mutex, which gives kmem_cache_create a chance to
   make an alias to a cache doomed to be destroyed.
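For context, the slab-side callers of the charge helpers live in mm/slab.h,
which is outside the filtered diffs shown here.  The following is only a
minimal illustrative sketch, not the exact tree contents, of how such a
helper charges pages straight through memcg_charge_kmem() once the
__memcg_charge_slab() wrappers from the first patch are gone; the names
memcg_charge_slab/memcg_uncharge_slab are assumed here:

static __always_inline int memcg_charge_slab(struct kmem_cache *s,
					     gfp_t gfp, int order)
{
	if (!memcg_kmem_enabled())
		return 0;
	if (is_root_cache(s))
		return 0;
	/* Charge 2^order pages against the memcg owning this child cache. */
	return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
}

static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
{
	if (!memcg_kmem_enabled())
		return;
	if (is_root_cache(s))
		return;
	memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
}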
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 +++++---------------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 186 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979..fb212e1d700d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags. diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b..2e3b448cfa2d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf7eb27e3ae..f3f8a4f52a0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. 
- */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. - */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct 
memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. 
*/ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b782a2d3b3d..6e1e4cf65836 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. 
+ */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. + */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 6de226191d12fce30331ebf024ca3ed24834f0ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:25:01 -0800 Subject: mm: memcontrol: track move_lock state internally The complexity of memcg page stat synchronization is currently leaking into the callsites, forcing them to keep track of the move_lock state and the IRQ flags. Simplify the API by tracking it in the memcg. 
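To make the API change concrete before the hunks below, here is the
page_add_file_rmap() callsite condensed from the mm/rmap.c hunk further
down, shown before and after this patch:

	/* before: the caller tracks the move_lock state and IRQ flags */
	bool locked;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	}
	mem_cgroup_end_page_stat(memcg, &locked, &flags);

	/* after: the memcg remembers move_lock_task and move_lock_flags */
	memcg = mem_cgroup_begin_page_stat(page);
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	}
	mem_cgroup_end_page_stat(memcg);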
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +++----- mm/memcontrol.c | 68 ++++++++++++++++++++++++++-------------------- mm/page-writeback.c | 12 +++----- mm/rmap.c | 12 +++----- 4 files changed, 51 insertions(+), 53 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb212e1d700d..76b4084b8d08 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -138,12 +138,10 @@ static inline bool mem_cgroup_disabled(void) return false; } -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, - unsigned long *flags); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags); +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) @@ -285,14 +283,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { return NULL; } -static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3f8a4f52a0c..028d07c79104 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -325,9 +325,11 @@ struct mem_cgroup { /* * set > 0 if pages under this cgroup are moving to other cgroup. */ - atomic_t moving_account; + atomic_t moving_account; /* taken only while moving_account > 0 */ - spinlock_t move_lock; + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; /* * percpu counter. */ @@ -1977,34 +1979,33 @@ cleanup: /** * mem_cgroup_begin_page_stat - begin a page state statistics transaction * @page: page that is going to change accounted state - * @locked: &memcg->move_lock slowpath was taken - * @flags: IRQ-state flags for &memcg->move_lock * * This function must mark the beginning of an accounted page state * change to prevent double accounting when the page is concurrently * being moved to another memcg: * - * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * memcg = mem_cgroup_begin_page_stat(page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg, locked, flags); - * - * The RCU lock is held throughout the transaction. The fast path can - * get away without acquiring the memcg->move_lock (@locked is false) - * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when the page - * state that is going to change is the only thing preventing the page - * from being uncharged. E.g. end-writeback clearing PageWriteback(), - * which allows migration to go ahead and uncharge the page before the - * account transaction might be complete. 
+ * mem_cgroup_end_page_stat(memcg); */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, - unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { struct mem_cgroup *memcg; + unsigned long flags; + /* + * The RCU lock is held throughout the transaction. The fast + * path can get away without acquiring the memcg->move_lock + * because page moving starts with an RCU grace period. + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page from being uncharged. + * E.g. end-writeback clearing PageWriteback(), which allows + * migration to go ahead and uncharge the page before the + * account transaction might be complete. + */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2014,16 +2015,22 @@ again: if (unlikely(!memcg)) return NULL; - *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - spin_lock_irqsave(&memcg->move_lock, *flags); + spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { - spin_unlock_irqrestore(&memcg->move_lock, *flags); + spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } - *locked = true; + + /* + * When charge migration first begins, we can have locked and + * unlocked page stat updates happening concurrently. Track + * the task who has the lock for mem_cgroup_end_page_stat(). + */ + memcg->move_lock_task = current; + memcg->move_lock_flags = flags; return memcg; } @@ -2031,14 +2038,17 @@ again: /** * mem_cgroup_end_page_stat - finish a page state statistics transaction * @memcg: the memcg that was accounted against - * @locked: value received from mem_cgroup_begin_page_stat() - * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { - if (memcg && *locked) - spin_unlock_irqrestore(&memcg->move_lock, *flags); + if (memcg && memcg->move_lock_task == current) { + unsigned long flags = memcg->move_lock_flags; + + memcg->move_lock_task = NULL; + memcg->move_lock_flags = 0; + + spin_unlock_irqrestore(&memcg->move_lock, flags); + } rcu_read_unlock(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..fb71e9deca85 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2308,12 +2308,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2338,19 +2336,17 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ 
-2380,7 +2376,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 70b32498d4f2..5e3e09081164 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } /** -- cgit v1.2.3 From 90cbc2508827e1e15dca23361c33cc26dd2b9e99 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 11 Feb 2015 15:25:55 -0800 Subject: vmscan: force scan offline memory cgroups Since commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") pages charged to a memory cgroup are not reparented when the cgroup is removed. Instead, they are supposed to be reclaimed in a regular way, along with pages accounted to online memory cgroups. However, an lruvec of an offline memory cgroup will sooner or later get so small that it will be scanned only at low scan priorities (see get_scan_count()). Therefore, if there are enough reclaimable pages in big lruvecs, pages accounted to offline memory cgroups will never be scanned at all, wasting memory. Fix this by unconditionally forcing scanning dead lruvecs from kswapd. [akpm@linux-foundation.org: fix build] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 14 ++++++++++++++ mm/vmscan.c | 8 ++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 76b4084b8d08..353537a5981a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); * For memory reclaim. 
*/ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); +bool mem_cgroup_lruvec_online(struct lruvec *lruvec); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); @@ -266,6 +267,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return 1; } +static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + return true; +} + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 028d07c79104..6187ca4d5dc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1367,6 +1367,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) diff --git a/mm/vmscan.c b/mm/vmscan.c index f756a202d5d5..b6dfa0081a8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1903,8 +1903,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && !zone_reclaimable(zone)) - force_scan = true; + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } if (!global_reclaim(sc)) force_scan = true; -- cgit v1.2.3 From 241994ed8649f7300667be8b13a9e04ae04e05a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:26:06 -0800 Subject: mm: memcontrol: default hierarchy interface for memory Introduce the basic control files to account, partition, and limit memory using cgroups in default hierarchy mode. This interface versioning allows us to address fundamental design issues in the existing memory cgroup interface, further explained below. The old interface will be maintained indefinitely, but a clearer model and improved workload performance should encourage existing users to switch over to the new one eventually. The control files are thus: - memory.current shows the current consumption of the cgroup and its descendants, in bytes. - memory.low configures the lower end of the cgroup's expected memory consumption range. The kernel considers memory below that boundary to be a reserve - the minimum that the workload needs in order to make forward progress - and generally avoids reclaiming it, unless there is an imminent risk of entering an OOM situation. - memory.high configures the upper end of the cgroup's expected memory consumption range. A cgroup whose consumption grows beyond this threshold is forced into direct reclaim, to work off the excess and to throttle new allocations heavily, but is generally allowed to continue and the OOM killer is not invoked. - memory.max configures the hard maximum amount of memory that the cgroup is allowed to consume before the OOM killer is invoked. 
- memory.events shows event counters that indicate how often the cgroup was reclaimed while below memory.low, how often it was forced to reclaim excess beyond memory.high, how often it hit memory.max, and how often it entered OOM due to memory.max. This allows users to identify configuration problems when observing a degradation in workload performance. An overcommitted system will have an increased rate of low boundary breaches, whereas increased rates of high limit breaches, maximum hits, or even OOM situations will indicate internally overcommitted cgroups. For existing users of memory cgroups, the following deviations from the current interface are worth pointing out and explaining: - The original lower boundary, the soft limit, is defined as a limit that is per default unset. As a result, the set of cgroups that global reclaim prefers is opt-in, rather than opt-out. The costs for optimizing these mostly negative lookups are so high that the implementation, despite its enormous size, does not even provide the basic desirable behavior. First off, the soft limit has no hierarchical meaning. All configured groups are organized in a global rbtree and treated like equal peers, regardless where they are located in the hierarchy. This makes subtree delegation impossible. Second, the soft limit reclaim pass is so aggressive that it not just introduces high allocation latencies into the system, but also impacts system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated reserve. A cgroup enjoys reclaim protection when it and all its ancestors are below their low boundaries, which makes delegation of subtrees possible. Secondly, new cgroups have no reserve per default and in the common case most cgroups are eligible for the preferred reclaim pass. This allows the new low boundary to be efficiently implemented with just a minor addition to the generic reclaim code, without the need for out-of-band data structures and reclaim passes. Because the generic reclaim code considers all cgroups except for the ones running low in the preferred first reclaim pass, overreclaim of individual groups is eliminated as well, resulting in much better overall workload performance. - The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. But this generally goes against the goal of making the most out of the available memory. The memory consumption of workloads varies during runtime, and that requires users to overcommit. But doing that with a strict upper limit requires either a fairly accurate prediction of the working set size or adding slack to the limit. Since working set size estimation is hard and error prone, and getting it wrong results in OOM kills, most users tend to err on the side of a looser limit and end up wasting precious resources. The memory.high boundary on the other hand can be set much more conservatively. When hit, it throttles allocations by forcing them into direct reclaim to work off the excess, but it never invokes the OOM killer. As a result, a high boundary that is chosen too aggressively will not terminate the processes, but instead it will lead to gradual performance degradation. The user can monitor this and make corrections until the minimal memory footprint that still gives acceptable performance is found. 
In extreme cases, with many concurrent allocations and a complete breakdown of reclaim progress within the group, the high boundary can be exceeded. But even then it's mostly better to satisfy the allocation from the slack available in other groups or the rest of the system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. - The original control file names are unwieldy and inconsistent in many different ways. For example, the upper boundary hit count is exported in the memory.failcnt file, but an OOM event count has to be manually counted by listening to memory.oom_control events, and lower boundary / soft limit events have to be counted by first setting a threshold for that value and then counting those events. Also, usage and limit files encode their units in the filename. That makes the filenames very long, even though this is not information that a user needs to be reminded of every time they type out those names. To address these naming issues, as well as to signal clearly that the new interface carries a new configuration model, the naming conventions in it necessarily differ from the old interface. - The original limit files indicate the state of an unset limit with a very high number, and a configured limit can be unset by echoing -1 into those files. But that very high number is implementation and architecture dependent and not very descriptive. And while -1 can be understood as an underflow into the highest possible value, -2 or -10M etc. do not work, so it's not inconsistent. memory.low, memory.high, and memory.max will use the string "infinity" to indicate and set the highest possible value. [akpm@linux-foundation.org: use seq_puts() for basic strings] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/unified-hierarchy.txt | 79 ++++++++++ include/linux/memcontrol.h | 32 ++++ mm/memcontrol.c | 229 ++++++++++++++++++++++++++-- mm/vmscan.c | 22 ++- 4 files changed, 348 insertions(+), 14 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 4f4563277864..71daa35ec2d9 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -327,6 +327,85 @@ supported and the interface files "release_agent" and - use_hierarchy is on by default and the cgroup file for the flag is not created. +- The original lower boundary, the soft limit, is defined as a limit + that is per default unset. As a result, the set of cgroups that + global reclaim prefers is opt-in, rather than opt-out. The costs + for optimizing these mostly negative lookups are so high that the + implementation, despite its enormous size, does not even provide the + basic desirable behavior. First off, the soft limit has no + hierarchical meaning. All configured groups are organized in a + global rbtree and treated like equal peers, regardless where they + are located in the hierarchy. This makes subtree delegation + impossible. Second, the soft limit reclaim pass is so aggressive + that it not just introduces high allocation latencies into the + system, but also impacts system performance due to overreclaim, to + the point where the feature becomes self-defeating. + + The memory.low boundary on the other hand is a top-down allocated + reserve. 
A cgroup enjoys reclaim protection when it and all its + ancestors are below their low boundaries, which makes delegation of + subtrees possible. Secondly, new cgroups have no reserve per + default and in the common case most cgroups are eligible for the + preferred reclaim pass. This allows the new low boundary to be + efficiently implemented with just a minor addition to the generic + reclaim code, without the need for out-of-band data structures and + reclaim passes. Because the generic reclaim code considers all + cgroups except for the ones running low in the preferred first + reclaim pass, overreclaim of individual groups is eliminated as + well, resulting in much better overall workload performance. + +- The original high boundary, the hard limit, is defined as a strict + limit that can not budge, even if the OOM killer has to be called. + But this generally goes against the goal of making the most out of + the available memory. The memory consumption of workloads varies + during runtime, and that requires users to overcommit. But doing + that with a strict upper limit requires either a fairly accurate + prediction of the working set size or adding slack to the limit. + Since working set size estimation is hard and error prone, and + getting it wrong results in OOM kills, most users tend to err on the + side of a looser limit and end up wasting precious resources. + + The memory.high boundary on the other hand can be set much more + conservatively. When hit, it throttles allocations by forcing them + into direct reclaim to work off the excess, but it never invokes the + OOM killer. As a result, a high boundary that is chosen too + aggressively will not terminate the processes, but instead it will + lead to gradual performance degradation. The user can monitor this + and make corrections until the minimal memory footprint that still + gives acceptable performance is found. + + In extreme cases, with many concurrent allocations and a complete + breakdown of reclaim progress within the group, the high boundary + can be exceeded. But even then it's mostly better to satisfy the + allocation from the slack available in other groups or the rest of + the system than killing the group. Otherwise, memory.max is there + to limit this type of spillover and ultimately contain buggy or even + malicious applications. + +- The original control file names are unwieldy and inconsistent in + many different ways. For example, the upper boundary hit count is + exported in the memory.failcnt file, but an OOM event count has to + be manually counted by listening to memory.oom_control events, and + lower boundary / soft limit events have to be counted by first + setting a threshold for that value and then counting those events. + Also, usage and limit files encode their units in the filename. + That makes the filenames very long, even though this is not + information that a user needs to be reminded of every time they type + out those names. + + To address these naming issues, as well as to signal clearly that + the new interface carries a new configuration model, the naming + conventions in it necessarily differ from the old interface. + +- The original limit files indicate the state of an unset limit with a + Very High Number, and a configured limit can be unset by echoing -1 + into those files. But that very high number is implementation and + architecture dependent and not very descriptive. And while -1 can + be understood as an underflow into the highest possible value, -2 or + -10M etc. 
do not work, so it's not consistent. + + memory.low, memory.high, and memory.max will use the string + "infinity" to indicate and set the highest possible value. 5. Planned Changes diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 353537a5981a..6cfd934c7c9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, + /* default hierarchy events */ + MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + #ifdef CONFIG_MEMCG +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr); + +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -175,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +static inline void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ +} + +static inline bool mem_cgroup_low(struct mem_cgroup *root, + struct mem_cgroup *memcg) +{ + return false; +} + static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6453ea5a27aa..ee97c9ac62c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,14 +97,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, -}; - static const char * const mem_cgroup_events_names[] = { "pgpgin", "pgpgout", @@ -138,7 +130,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -284,6 +276,10 @@ struct mem_cgroup { struct page_counter memsw; struct page_counter kmem; + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + unsigned long soft_limit; /* vmpressure notifications */ @@ -2315,6 +2311,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -2356,6 +2354,8 @@ retry: if (fatal_signal_pending(current)) goto bypass; + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2367,6 +2367,16 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. 
+ */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; } @@ -4276,7 +4286,7 @@ out_kfree: return ret; } -static struct cftype mem_cgroup_files[] = { +static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), @@ -4552,6 +4562,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_css == NULL) { root_mem_cgroup = memcg; page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4597,6 +4608,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent->use_hierarchy) { page_counter_init(&memcg->memory, &parent->memory); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); @@ -4607,6 +4619,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) */ } else { page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4682,6 +4695,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->low = 0; + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; } @@ -5267,6 +5282,147 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = ACCESS_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = ACCESS_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = ACCESS_ONCE(memcg->memory.limit); + + if (max 
== PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &max); + if (err) + return err; + + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; + + return nbytes; +} + +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, @@ -5277,7 +5433,8 @@ struct cgroup_subsys memory_cgrp_subsys = { .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .legacy_cftypes = mem_cgroup_files, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, }; @@ -5312,6 +5469,56 @@ static void __init enable_swap_cgroup(void) } #endif +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} + +/** + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check + * + * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. + */ +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return false; + + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. 
+ */ + + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + } + return true; +} + #ifdef CONFIG_MEMCG_SWAP /** * mem_cgroup_swapout - transfer a memsw charge to swap diff --git a/mm/vmscan.c b/mm/vmscan.c index b6dfa0081a8e..8e645ee52045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -91,6 +91,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Can cgroups be reclaimed below their normal consumption range? */ + unsigned int may_thrash:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2294,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, struct lruvec *lruvec; int swappiness; + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); @@ -2315,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); /* * Shrink the slab caches in the same proportion that @@ -2519,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { + int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; bool zones_reclaimable; - +retry: delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2572,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + /* Any of the zones still reclaimable? Don't OOM. */ if (zones_reclaimable) return 1; -- cgit v1.2.3
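As a closing usage illustration (not part of any patch above), here is a
small userspace sketch that configures the unified-hierarchy memory files
introduced by the last commit.  The cgroup path and the byte values are
assumptions; per the commit, limits can also be written with memparse
suffixes or as the string "infinity".

/* Illustrative only: configure memory.low/high/max for one cgroup. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_cg(const char *cgroup, const char *file, const char *val)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/%s", cgroup, file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* Assumed mount point of the unified hierarchy. */
	const char *cg = "/sys/fs/cgroup/mygroup";

	write_cg(cg, "memory.low",  "268435456");	/* 256M reserve */
	write_cg(cg, "memory.high", "1073741824");	/* start reclaim at 1G */
	write_cg(cg, "memory.max",  "2147483648");	/* hard cap at 2G */
	return 0;
}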