From f3a53a3a1e5b3b9bc114b5b7930fbe89d9ffed5d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 7 Jun 2018 17:05:35 -0700
Subject: mm, memcontrol: implement memory.swap.events

Add swap max and fail events so that userland can monitor and respond to
running out of swap.

I'm not too sure about the fail event. Right now, it's a bit confusing
which stats / events are recursive and which aren't, and also which ones
reflect events that originate from a given cgroup and which ones target
the cgroup. No idea what the right long term solution is and it could
just be that growing them organically is actually the only right thing
to do.

Link: http://lkml.kernel.org/r/20180416231151.GI1911913@devbig577.frc2.facebook.com
Signed-off-by: Tejun Heo
Reviewed-by: Andrew Morton
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Vladimir Davydov
Cc: Roman Gushchin
Cc: Rik van Riel
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d99b71bc2c66..517096c3cc99 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -53,6 +53,8 @@ enum memcg_memory_event {
 	MEMCG_HIGH,
 	MEMCG_MAX,
 	MEMCG_OOM,
+	MEMCG_SWAP_MAX,
+	MEMCG_SWAP_FAIL,
 	MEMCG_NR_MEMORY_EVENTS,
 };

@@ -208,6 +210,9 @@ struct mem_cgroup {
 	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
 	struct cgroup_file events_file;

+	/* handle for "memory.swap.events" */
+	struct cgroup_file swap_events_file;
+
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;

-- cgit v1.2.3

From bbec2e15170aae3e084d7d9afc730aeebe01b654 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Thu, 7 Jun 2018 17:06:18 -0700
Subject: mm: rename page_counter's count/limit into usage/max

This patch renames struct page_counter fields:
  count -> usage
  limit -> max

and the corresponding functions:
  page_counter_limit() -> page_counter_set_max()
  mem_cgroup_get_limit() -> mem_cgroup_get_max()
  mem_cgroup_resize_limit() -> mem_cgroup_resize_max()
  memcg_update_kmem_limit() -> memcg_update_kmem_max()
  memcg_update_tcp_limit() -> memcg_update_tcp_max()

The idea behind this renaming is to have the direct matching between
memory cgroup knobs (low, high, max) and page_counters API.

This is pure renaming, this patch doesn't bring any functional change.
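To make the rename concrete, below is a minimal userspace sketch (not the
kernel implementation) that models the renamed usage/max fields and the
page_counter_set_max()/page_counter_try_charge() semantics on a single,
non-hierarchical counter; the toy main(), the standalone C11 atomics and
the page values are assumptions for illustration only.

/*
 * Hypothetical userspace model of the renamed page_counter API; field and
 * function names mirror the kernel's, but this is not kernel code.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page_counter {
	atomic_long usage;	/* was "count" before this patch */
	unsigned long max;	/* was "limit" before this patch */
};

/* Like page_counter_set_max(): refuse a max below the current usage. */
static int page_counter_set_max(struct page_counter *c, unsigned long nr_pages)
{
	if ((unsigned long)atomic_load(&c->usage) > nr_pages)
		return -EBUSY;
	c->max = nr_pages;
	return 0;
}

/* Like page_counter_try_charge(), for one counter with no parent. */
static bool page_counter_try_charge(struct page_counter *c, unsigned long nr_pages)
{
	long new = atomic_fetch_add(&c->usage, nr_pages) + (long)nr_pages;

	if ((unsigned long)new > c->max) {
		atomic_fetch_sub(&c->usage, nr_pages);	/* roll back, charge failed */
		return false;
	}
	return true;
}

int main(void)
{
	struct page_counter memory = { .max = 512 };

	atomic_init(&memory.usage, 0);
	printf("charge 256 -> %d\n", page_counter_try_charge(&memory, 256)); /* 1 */
	printf("set max 128 -> %d\n", page_counter_set_max(&memory, 128));   /* -EBUSY */
	printf("charge 512 -> %d\n", page_counter_try_charge(&memory, 512)); /* 0 */
	return 0;
}

Under these semantics the set_max() call fails with -EBUSY because 256 pages
are already charged, matching the documented behaviour of
page_counter_set_max(): the maximum is never lowered below current usage.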
Link: http://lkml.kernel.org/r/20180405185921.4942-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 +- include/linux/page_counter.h | 12 ++--- mm/hugetlb_cgroup.c | 6 +-- mm/memcontrol.c | 112 +++++++++++++++++++++---------------------- mm/oom_kill.c | 2 +- mm/page_counter.c | 28 +++++------ 6 files changed, 82 insertions(+), 82 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 517096c3cc99..577a19a6a93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -467,7 +467,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -858,7 +858,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80ad32d..94029dad9317 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,8 +7,8 @@ #include struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long max; struct page_counter *parent; /* legacy */ @@ -25,14 +25,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +41,7 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..68c2f2f3c05b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch 
(MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d86665cf4a49..79bb5aeaa800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. */ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2444,10 +2444,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; int ret; @@ -2460,22 +2460,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? 
limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; @@ -2757,7 +2757,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +2871,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +2913,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +2941,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3126,8 +3126,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3626,7 +3626,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct 
cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5131,7 +5131,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5155,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6074,7 +6074,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6095,7 +6095,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6129,7 +6129,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6151,9 +6151,9 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + err = page_counter_set_max(&memcg->swap, max); + mutex_unlock(&memcg_max_mutex); if (err) return err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ba6cb88cf58..6694348b27e9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4..41937c9a9d11 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -22,7 +22,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? 
*/ WARN_ON_ONCE(new < 0); } @@ -41,7 +41,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +82,9 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -123,20 +123,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,17 +149,17 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. */ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } -- cgit v1.2.3 From 230671533d64631116be3ff9d407bd9ca5a58e1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:22 -0700 Subject: mm: memory.low hierarchical behavior This patch aims to address an issue in current memory.low semantics, which makes it hard to use it in a hierarchy, where some leaf memory cgroups are more valuable than others. For example, there are memcgs A, A/B, A/C, A/D and A/E: A A/memory.low = 2G, A/memory.current = 6G //\\ BC DE B/memory.low = 3G B/memory.current = 2G C/memory.low = 1G C/memory.current = 2G D/memory.low = 0 D/memory.current = 2G E/memory.low = 10G E/memory.current = 0 If we apply memory pressure, B, C and D are reclaimed at the same pace while A's usage exceeds 2G. This is obviously wrong, as B's usage is fully below B's memory.low, and C has 1G of protection as well. Also, A is pushed to the size, which is less than A's 2G memory.low, which is also wrong. A simple bash script (provided below) can be used to reproduce the problem. Current results are: A: 1430097920 A/B: 711929856 A/C: 717426688 A/D: 741376 A/E: 0 To address the issue a concept of effective memory.low is introduced. Effective memory.low is always equal or less than original memory.low. In a case, when there is no memory.low overcommittment (and also for top-level cgroups), these two values are equal. 
Otherwise it's a part of parent's effective memory.low, calculated as a cgroup's memory.low usage divided by sum of sibling's memory.low usages (under memory.low usage I mean the size of actually protected memory: memory.current if memory.current < memory.low, 0 otherwise). It's necessary to track the actual usage, because otherwise an empty cgroup with memory.low set (A/E in my example) will affect actual memory distribution, which makes no sense. To avoid traversing the cgroup tree twice, page_counters code is reused. Calculating effective memory.low can be done in the reclaim path, as we conveniently traversing the cgroup tree from top to bottom and check memory.low on each level. So, it's a perfect place to calculate effective memory low and save it to use it for children cgroups. This also eliminates a need to traverse the cgroup tree from bottom to top each time to check if parent's guarantee is not exceeded. Setting/resetting effective memory.low is intentionally racy, but it's fine and shouldn't lead to any significant differences in actual memory distribution. With this patch applied results are matching the expectations: A: 2147930112 A/B: 1428721664 A/C: 718393344 A/D: 815104 A/E: 0 Test script: #!/bin/bash CGPATH="/sys/fs/cgroup" truncate /file1 --size 2G truncate /file2 --size 2G truncate /file3 --size 2G truncate /file4 --size 50G mkdir "${CGPATH}/A" echo "+memory" > "${CGPATH}/A/cgroup.subtree_control" mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" echo 2G > "${CGPATH}/A/memory.low" echo 3G > "${CGPATH}/A/B/memory.low" echo 1G > "${CGPATH}/A/C/memory.low" echo 0 > "${CGPATH}/A/D/memory.low" echo 10G > "${CGPATH}/A/E/memory.low" echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1 echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2 echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3 echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4 echo "A: " `cat "${CGPATH}/A/memory.current"` echo "A/B: " `cat "${CGPATH}/A/B/memory.current"` echo "A/C: " `cat "${CGPATH}/A/C/memory.current"` echo "A/D: " `cat "${CGPATH}/A/D/memory.current"` echo "A/E: " `cat "${CGPATH}/A/E/memory.current"` rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" rmdir "${CGPATH}/A" rm /file1 /file2 /file3 /file4 Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +- include/linux/page_counter.h | 7 +++ mm/memcontrol.c | 112 ++++++++++++++++++++++++++++++++----------- mm/page_counter.c | 43 +++++++++++++++++ 4 files changed, 134 insertions(+), 31 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 577a19a6a93b..891945507044 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,8 +181,7 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 94029dad9317..7902a727d3b6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,8 +9,14 @@ struct page_counter { atomic_long_t usage; unsigned long max; + unsigned long low; struct 
page_counter *parent; + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79bb5aeaa800..d0261059aab9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4270,7 +4270,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg->low = 0; page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5064,7 +5064,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5086,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5348,36 +5348,72 @@ struct cgroup_subsys memory_cgrp_subsys = { * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * Returns %true if memory consumption of @memcg is below the normal range. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * @root is exclusive; it is never low when looked at directly * - * For example, given cgroup A with children B and C: + * To provide a proper hierarchical behavior, effective memory.low value + * is used. * - * A - * / \ - * B C + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. 
+ * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. * - * and + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. + * + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: + * + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 + * + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): + * + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_low_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_low() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. 
*/ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { + unsigned long usage, low_usage, siblings_low_usage; + unsigned long elow, parent_elow; + struct mem_cgroup *parent; + if (mem_cgroup_disabled()) return false; @@ -5386,12 +5422,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root) return false; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; - } + elow = memcg->memory.low; + usage = page_counter_read(&memcg->memory); + parent = parent_mem_cgroup(memcg); - return true; + if (parent == root) + goto exit; + + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + + if (!elow || !parent_elow) + goto exit; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (!low_usage || !siblings_low_usage) + goto exit; + + elow = min(elow, parent_elow * low_usage / siblings_low_usage); +exit: + memcg->memory.elow = elow; + return usage < elow; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index 41937c9a9d11..a5ff4cbc355a 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,28 @@ #include #include +static void propagate_low_usage(struct page_counter *c, unsigned long usage) +{ + unsigned long low_usage, old; + long delta; + + if (!c->parent) + return; + + if (!c->low && !atomic_long_read(&c->low_usage)) + return; + + if (usage <= c->low) + low_usage = usage; + else + low_usage = 0; + + old = atomic_long_xchg(&c->low_usage, low_usage); + delta = low_usage - old; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -23,6 +45,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_low_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -42,6 +65,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -85,6 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_low_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -164,6 +190,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. 
+ */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_low_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse -- cgit v1.2.3 From 9ccc361716362e8abb01d9944f8df97b4aac3e23 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 7 Jun 2018 17:07:19 -0700 Subject: memcg: writeback: use memcg->cgwb_list directly mem_cgroup_cgwb_list is a very simple wrapper and it will never be used outside of code under CONFIG_CGROUP_WRITEBACK. so use memcg->cgwb_list directly. Link: http://lkml.kernel.org/r/1524406173-212182-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/backing-dev.c | 4 ++-- mm/memcontrol.c | 5 ----- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 891945507044..10d741e8fe51 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1097,7 +1097,6 @@ static inline void dec_lruvec_page_state(struct page *page, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fe3ebd6ac00..347cc834c04a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -736,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8bab4660e6aa..2751259f0a76 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3562,11 +3562,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); -- cgit v1.2.3 From bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:07:46 -0700 Subject: memcg: introduce memory.min Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. 
This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin Reviewed-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 27 +++++++- include/linux/memcontrol.h | 15 ++-- include/linux/page_counter.h | 11 ++- mm/memcontrol.c | 118 +++++++++++++++++++++++++------- mm/page_counter.c | 63 ++++++++++++----- mm/vmscan.c | 18 ++++- 6 files changed, 202 insertions(+), 50 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7b56ca80e37a..e34d3c938729 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back. The total amount of memory currently being used by the cgroup and its descendants. + memory.min + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Hard memory protection. If the memory usage of a cgroup + is within its effective min boundary, the cgroup's memory + won't be reclaimed under any conditions. If there is no + unprotected reclaimable memory available, OOM killer + is invoked. + + Effective min boundary is limited by memory.min values of + all ancestor cgroups. If there is memory.min overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.min. + + Putting more memory than generally available under this + protection is discouraged and may lead to constant OOMs. + + If a memory cgroup is not populated with processes, + its memory.min is ignored. + memory.low A read-write single value file which exists on non-root cgroups. The default is "0". @@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back. Effective low boundary is limited by memory.low values of all ancestor cgroups. 
If there is memory.low overcommitment - (child cgroup or cgroups are requiring more protected memory, + (child cgroup or cgroups are requiring more protected memory than parent will allow), then each child cgroup will get - the part of parent's protection proportional to the its + the part of parent's protection proportional to its actual memory usage below memory.low. Putting more memory than generally available under this diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 10d741e8fe51..9c04cf8e6487 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,12 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7902a727d3b6..bab7e57f659b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -8,10 +8,16 @@ struct page_counter { atomic_long_t usage; - unsigned long max; + unsigned long min; unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; @@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6de0d6a3a8d..e3d56927a724 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); @@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); 
page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, @@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is in the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is in the normal range. + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected * - * @root is exclusive; it is never low when looked at directly + * @root is exclusive; it is never protected when looked at directly * - * To provide a proper hierarchical behavior, effective memory.low value - * is used. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. * * Effective memory.low is always equal or less than the original memory.low. * If there is no memory.low overcommittment (which is always true for @@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = { * E/memory.current = 0 * * These calculations require constant tracking of the actual low usages - * (see propagate_low_usage()), as well as recursive calculation of - * effective memory.low values. But as we do call mem_cgroup_low() + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() * path for each memory cgroup top-down from the reclaim, * it's possible to optimize this part, and save calculated elow * for next usage. This part is intentionally racy, but it's ok, * as memory.low is a best-effort mechanism. 
*/ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { - unsigned long usage, low_usage, siblings_low_usage; - unsigned long elow, parent_elow; struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; - elow = memcg->memory.low; usage = page_counter_read(&memcg->memory); - parent = parent_mem_cgroup(memcg); + if (!usage) + return MEMCG_PROT_NONE; + + emin = memcg->memory.min; + elow = memcg->memory.low; + parent = parent_mem_cgroup(memcg); if (parent == root) goto exit; + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); + } + parent_elow = READ_ONCE(parent->memory.elow); elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; - if (!elow || !parent_elow) - goto exit; + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); - low_usage = min(usage, memcg->memory.low); - siblings_low_usage = atomic_long_read( - &parent->memory.children_low_usage); - - if (!low_usage || !siblings_low_usage) - goto exit; + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } - elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: + memcg->memory.emin = emin; memcg->memory.elow = elow; - return usage && usage <= elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index a5ff4cbc355a..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,26 +13,38 @@ #include #include -static void propagate_low_usage(struct page_counter *c, unsigned long usage) +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) { - unsigned long low_usage, old; + unsigned long protected, old_protected; long delta; if (!c->parent) return; - if (!c->low && !atomic_long_read(&c->low_usage)) - return; + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } - if (usage <= c->low) - low_usage = usage; - else - low_usage = 0; + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; - old = atomic_long_xchg(&c->low_usage, low_usage); - delta = low_usage - old; - if (delta) - atomic_long_add(delta, &c->parent->children_low_usage); + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } } /** @@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long 
nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_set_low - set the amount of protected memory * @counter: counter @@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) counter->low = nr_pages; for (c = counter; c; c = c->parent) - propagate_low_usage(c, atomic_long_read(&c->usage)); + propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e67b477ecef..03822f86f288 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; -- cgit v1.2.3 From e81bf9793b1861d74953ef041b4f6c7faecc2dbd Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 7 Jun 2018 17:09:44 -0700 Subject: mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline The LKP robot found a 27% will-it-scale/page_fault3 performance regression regarding commit e27be240df53("mm: memcg: make sure memory.events is uptodate when waking pollers"). What the test does is: 1 mkstemp() a 128M file on a tmpfs; 2 start $nr_cpu processes, each to loop the following: 2.1 mmap() this file in shared write mode; 2.2 write 0 to this file in a PAGE_SIZE step till the end of the file; 2.3 unmap() this file and repeat this process. 
3 After 5 minutes, check how many loops they managed to complete, the higher the better. The commit itself looks innocent enough as it merely changed some event counting mechanism and this test didn't trigger those events at all. Perf shows increased cycles spent on accessing root_mem_cgroup->stat_cpu in count_memcg_event_mm()(called by handle_mm_fault()) and in __mod_memcg_state() called by page_add_file_rmap(). So it's likely due to the changed layout of 'struct mem_cgroup' that either make stat_cpu falling into a constantly modifying cacheline or some hot fields stop being in the same cacheline. I verified this by moving memory_events[] back to where it was: : --- a/include/linux/memcontrol.h : +++ b/include/linux/memcontrol.h : @@ -205,7 +205,6 @@ struct mem_cgroup { : int oom_kill_disable; : : /* memory.events */ : - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; : struct cgroup_file events_file; : : /* protect arrays of thresholds */ : @@ -238,6 +237,7 @@ struct mem_cgroup { : struct mem_cgroup_stat_cpu __percpu *stat_cpu; : atomic_long_t stat[MEMCG_NR_STAT]; : atomic_long_t events[NR_VM_EVENT_ITEMS]; : + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; : : unsigned long socket_pressure; And performance restored. Later investigation found that as long as the following 3 fields moving_account, move_lock_task and stat_cpu are in the same cacheline, performance will be good. To avoid future performance surprise by other commits changing the layout of 'struct mem_cgroup', this patch makes sure the 3 fields stay in the same cacheline. One concern of this approach is, moving_account and move_lock_task could be modified when a process changes memory cgroup while stat_cpu is a always read field, it might hurt to place them in the same cacheline. I assume it is rare for a process to change memory cgroup so this should be OK. Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com Signed-off-by: Aaron Lu Reported-by: kernel test robot Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9c04cf8e6487..4f52ec755725 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -166,6 +166,15 @@ enum memcg_kmem_state { KMEM_ONLINE, }; +#if defined(CONFIG_SMP) +struct memcg_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define MEMCG_PADDING(name) struct memcg_padding name; +#else +#define MEMCG_PADDING(name) +#endif + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -212,7 +221,6 @@ struct mem_cgroup { int oom_kill_disable; /* memory.events */ - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* handle for "memory.swap.events" */ @@ -235,19 +243,26 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + MEMCG_PADDING(_pad1_); + /* * set > 0 if pages under this cgroup are moving to other cgroup. 
*/ atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; struct task_struct *move_lock_task; - unsigned long move_lock_flags; /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; + + MEMCG_PADDING(_pad2_); + atomic_long_t stat[MEMCG_NR_STAT]; atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; -- cgit v1.2.3
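As a rough illustration of the padding idiom above, here is a hypothetical
userspace sketch that uses C11 alignas in place of MEMCG_PADDING() /
____cacheline_internodealigned_in_smp; the 64-byte line size, the field
subset and the array sizes are assumptions, not the real struct mem_cgroup
layout.

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64	/* assumed line size, for illustration only */

struct mem_cgroup_sketch {
	unsigned long other_state[4];		/* unrelated fields */

	/* _pad1_ equivalent: start the group below on a fresh cacheline */
	alignas(CACHELINE) int moving_account;
	void *move_lock_task;
	void *stat_cpu;				/* read on every charge/fault */

	/* _pad2_ equivalent: keep the large arrays off that cacheline */
	alignas(CACHELINE) long stat[16];
};

int main(void)
{
	printf("moving_account on line %zu\n",
	       offsetof(struct mem_cgroup_sketch, moving_account) / CACHELINE);
	printf("stat_cpu       on line %zu\n",
	       offsetof(struct mem_cgroup_sketch, stat_cpu) / CACHELINE);
	printf("stat[]         on line %zu\n",
	       offsetof(struct mem_cgroup_sketch, stat) / CACHELINE);
	return 0;
}

With 64-byte lines this prints line 1 for both moving_account and stat_cpu
and line 2 for stat[], i.e. the fields named in the changelog share one
cacheline while the large counter arrays start on the next one, which is
the layout property the patch pins down against future reordering.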